From de679b3bbe8b810b01929b1f27665ede5eb54f37 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 24 Mar 2016 10:39:14 -0700 Subject: [PATCH 0001/3220] mm: Export do_munmap The 0-day build bot reports the following build error, seen if SDCARD_FS is built as module. ERROR: "do_munmap" undefined! Fixes: 84a1b7d3d312 ("Included sdcardfs source code for kernel 3.0") Reported-by: Fengguang Wu Signed-off-by: Guenter Roeck --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmap.c b/mm/mmap.c index 2b1143008793..ee856266cc61 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2639,6 +2639,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } +EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long start, size_t len) { From a9c7ddcf0347675f8bc681830d21d27df2ed8cb1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 25 Nov 2015 16:03:31 -0500 Subject: [PATCH 0002/3220] UPSTREAM: dm: don't save and restore bi_private Device mapper used the field bi_private to point to dm_target_io. However, since kernel 3.15, the bi_private field is unused, and so the targets do not need to save and restore this field. This patch removes code that saves and restores bi_private from dm-cache, dm-snapshot and dm-verity. Change-Id: Ic72905ccb6d58ff94eafaa47ba54b2688d92d3d1 Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer (cherry picked from commit fe3265b180d6282648f03bc6ac3958c733df01c2) --- drivers/md/dm-cache-target.c | 3 --- drivers/md/dm-snap.c | 6 +----- drivers/md/dm-verity.c | 5 +---- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2fd4c8296144..5780accffa30 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -118,14 +118,12 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) */ struct dm_hook_info { bio_end_io_t *bi_end_io; - void *bi_private; }; static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, bio_end_io_t *bi_end_io, void *bi_private) { h->bi_end_io = bio->bi_end_io; - h->bi_private = bio->bi_private; bio->bi_end_io = bi_end_io; bio->bi_private = bi_private; @@ -134,7 +132,6 @@ static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) { bio->bi_end_io = h->bi_end_io; - bio->bi_private = h->bi_private; } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c06b74e91cd6..f68d0ae5b198 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -207,7 +207,6 @@ struct dm_snap_pending_exception { */ struct bio *full_bio; bio_end_io_t *full_bio_end_io; - void *full_bio_private; }; /* @@ -1485,10 +1484,8 @@ out: snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); full_bio = pe->full_bio; - if (full_bio) { + if (full_bio) full_bio->bi_end_io = pe->full_bio_end_io; - full_bio->bi_private = pe->full_bio_private; - } increment_pending_exceptions_done_count(); up_write(&s->lock); @@ -1605,7 +1602,6 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - pe->full_bio_private = bio->bi_private; callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index ccf41886ebcf..9e8891507c1c 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -83,9 +83,8 @@ struct dm_verity 
{ struct dm_verity_io { struct dm_verity *v; - /* original values of bio->bi_end_io and bio->bi_private */ + /* original value of bio->bi_end_io */ bio_end_io_t *orig_bi_end_io; - void *orig_bi_private; sector_t block; unsigned n_blocks; @@ -453,7 +452,6 @@ static void verity_finish_io(struct dm_verity_io *io, int error) struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); bio->bi_end_io = io->orig_bi_end_io; - bio->bi_private = io->orig_bi_private; bio->bi_error = error; bio_endio(bio); @@ -566,7 +564,6 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, ti->per_bio_data_size); io->v = v; io->orig_bi_end_io = bio->bi_end_io; - io->orig_bi_private = bio->bi_private; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; From ab14138e109c1d292c2eb28bdb8f701cf34a86a0 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 5 Nov 2015 02:02:31 +0000 Subject: [PATCH 0003/3220] UPSTREAM: dm verity: clean up duplicate hashing code Handle dm-verity salting in one place to simplify the code. Change-Id: If923a01dc63ae5123af13ba1b0863b73e33ddf46 Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Mike Snitzer (cherry picked from commit 6dbeda3469ced777bc3138ed5918f7ae79670b7b) --- drivers/md/dm-verity.c | 266 +++++++++++++++++++++++------------------ 1 file changed, 149 insertions(+), 117 deletions(-) diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 9e8891507c1c..24517055bd8e 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -172,6 +172,84 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } +/* + * Wrapper for crypto_shash_init, which handles verity salting. 
+ */ +static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) +{ + int r; + + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_init failed: %d", r); + return r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + return 0; +} + +static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len) +{ + int r = crypto_shash_update(desc, data, len); + + if (unlikely(r < 0)) + DMERR("crypto_shash_update failed: %d", r); + + return r; +} + +static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, + u8 *digest) +{ + int r; + + if (unlikely(!v->version)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + r = crypto_shash_final(desc, digest); + + if (unlikely(r < 0)) + DMERR("crypto_shash_final failed: %d", r); + + return r; +} + +static int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest) +{ + int r; + + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) + return r; + + r = verity_hash_update(v, desc, data, len); + if (unlikely(r < 0)) + return r; + + return verity_hash_final(v, desc, digest); +} + static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, sector_t *hash_block, unsigned *offset) { @@ -252,10 +330,10 @@ out: * If "skip_unverified" is false, unverified buffer is hashed and verified * against current value of io_want_digest(v, io). */ -static int verity_verify_level(struct dm_verity_io *io, sector_t block, - int level, bool skip_unverified) +static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, int level, bool skip_unverified, + u8 *want_digest) { - struct dm_verity *v = io->v; struct dm_buffer *buf; struct buffer_aux *aux; u8 *data; @@ -272,74 +350,71 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block, aux = dm_bufio_get_aux_data(buf); if (!aux->hash_verified) { - struct shash_desc *desc; - u8 *result; - if (skip_unverified) { r = 1; goto release_ret_r; } - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash(v, io_hash_desc(v, io), + data, 1 << v->hash_dev_block_bits, + io_real_digest(v, io)); + if (unlikely(r < 0)) goto release_ret_r; - } - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - - r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); - goto release_ret_r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block)) { - r = -EIO; - goto release_ret_r; - } - } else + if 
(likely(memcmp(io_real_digest(v, io), want_digest, + v->digest_size) == 0)) aux->hash_verified = 1; + else if (verity_handle_err(v, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block)) { + r = -EIO; + goto release_ret_r; + } } data += offset; - - memcpy(io_want_digest(v, io), data, v->digest_size); - - dm_bufio_release(buf); - return 0; + memcpy(want_digest, data, v->digest_size); + r = 0; release_ret_r: dm_bufio_release(buf); - return r; } +/* + * Find a hash for a given block, write it to digest and verify the integrity + * of the hash tree if necessary. + */ +static int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest) +{ + int i; + int r; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 1 and we fall back to whole + * chain verification. + */ + r = verity_verify_level(v, io, block, 0, true, digest); + if (likely(r <= 0)) + return r; + } + + memcpy(digest, v->root_digest, v->digest_size); + + for (i = v->levels - 1; i >= 0; i--) { + r = verity_verify_level(v, io, block, i, false, digest); + if (unlikely(r)) + return r; + } + + return 0; +} + /* * Verify one "dm_verity_io" structure. */ @@ -349,54 +424,21 @@ static int verity_verify_io(struct dm_verity_io *io) struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); unsigned b; - int i; for (b = 0; b < io->n_blocks; b++) { - struct shash_desc *desc; - u8 *result; int r; unsigned todo; + struct shash_desc *desc = io_hash_desc(v, io); - if (likely(v->levels)) { - /* - * First, we try to get the requested hash for - * the current block. If the hash block itself is - * verified, zero is returned. If it isn't, this - * function returns 0 and we fall back to whole - * chain verification. 
- */ - int r = verity_verify_level(io, io->block + b, 0, true); - if (likely(!r)) - goto test_block_hash; - if (r < 0) - return r; - } - - memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); - - for (i = v->levels - 1; i >= 0; i--) { - int r = verity_verify_level(io, io->block + b, i, false); - if (unlikely(r)) - return r; - } - -test_block_hash: - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash_for_block(v, io, io->block + b, + io_want_digest(v, io)); + if (unlikely(r < 0)) + return r; + + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) return r; - } - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } todo = 1 << v->data_dev_block_bits; do { u8 *page; @@ -407,37 +449,27 @@ test_block_hash: len = bv.bv_len; if (likely(len >= todo)) len = todo; - r = crypto_shash_update(desc, page + bv.bv_offset, len); + r = verity_hash_update(v, desc, page + bv.bv_offset, + len); kunmap_atomic(page); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); + if (unlikely(r < 0)) return r; - } bio_advance_iter(bio, &io->iter, len); todo -= len; } while (todo); - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); + r = verity_hash_final(v, desc, io_real_digest(v, io)); + if (unlikely(r < 0)) return r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) - return -EIO; - } + + if (likely(memcmp(io_real_digest(v, io), + io_want_digest(v, io), v->digest_size) == 0)) + continue; + else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b)) + return -EIO; } return 0; From 7475611ceceb9eb0cfd31f44d2c820444e1276d0 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 5 Nov 2015 02:02:32 +0000 Subject: [PATCH 0004/3220] UPSTREAM: dm verity: separate function for parsing opt args Move optional argument parsing into a separate function to make it easier to add more of them without making verity_ctr even longer. 
Change-Id: I9cd9df41c3326824f8cca5764075501987e78a52 Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Signed-off-by: Mike Snitzer (cherry picked from commit 753c1fd02807cb43a1c5d01d75d454054d46bdad) --- drivers/md/dm-verity.c | 71 +++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 24517055bd8e..b0a53c3b926d 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -34,6 +34,8 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPTS_MAX 1 + static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); @@ -721,6 +723,44 @@ static void verity_dtr(struct dm_target *ti) kfree(v); } +static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) +{ + int r; + unsigned argc; + struct dm_target *ti = v->ti; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"}, + }; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + do { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) { + v->mode = DM_VERITY_MODE_LOGGING; + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { + v->mode = DM_VERITY_MODE_RESTART; + continue; + } + + ti->error = "Unrecognized verity feature request"; + return -EINVAL; + } while (argc && !r); + + return r; +} + /* * Target parameters: * The current format is version 1. @@ -739,18 +779,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; - const char *opt_string; - unsigned int num, opt_params; + unsigned int num; unsigned long long num_ll; int r; int i; sector_t hash_position; char dummy; - static struct dm_arg _args[] = { - {0, 1, "Invalid number of feature args"}, - }; - v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); if (!v) { ti->error = "Cannot allocate verity structure"; @@ -895,29 +930,9 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) as.argc = argc; as.argv = argv; - r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); - if (r) + r = verity_parse_opt_args(&as, v); + if (r < 0) goto bad; - - while (opt_params) { - opt_params--; - opt_string = dm_shift_arg(&as); - if (!opt_string) { - ti->error = "Not enough feature arguments"; - r = -EINVAL; - goto bad; - } - - if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING)) - v->mode = DM_VERITY_MODE_LOGGING; - else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART)) - v->mode = DM_VERITY_MODE_RESTART; - else { - ti->error = "Invalid feature arguments"; - r = -EINVAL; - goto bad; - } - } } v->hash_per_block_bits = From 6558fe3cbe58107b887fb449faf7cf0295f65932 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 15:36:00 -0500 Subject: [PATCH 0005/3220] UPSTREAM: dm verity: move dm-verity.c to dm-verity-target.c Prepare for extending dm-verity with an optional object. Follows the naming convention used by other DM targets (e.g. dm-cache and dm-era). 
Change-Id: If6d2f27b290adf14fa77f3745fdc13aaa417c8dc Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit 03045cbafa2d663ad8d0a583ac219d202d824344) --- drivers/md/Makefile | 1 + drivers/md/{dm-verity.c => dm-verity-target.c} | 0 2 files changed, 1 insertion(+) rename drivers/md/{dm-verity.c => dm-verity-target.c} (100%) diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f34979cd141a..94e9f6bb33d1 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -16,6 +16,7 @@ dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o +dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o raid456-y += raid5.o raid5-cache.o diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity-target.c similarity index 100% rename from drivers/md/dm-verity.c rename to drivers/md/dm-verity-target.c From 3ec912f8ef374d0805eaf496535aaf51b159e355 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 16:01:51 -0500 Subject: [PATCH 0006/3220] UPSTREAM: dm verity: factor out structures and functions useful to separate object Prepare for an optional verity object to make use of existing dm-verity structures and functions. Change-Id: Ib14c3834bfed222b33e068908fb5f71a53e1187b Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit ffa393807cd69656d5b6bc9d9622e205071cbab8) --- drivers/md/dm-verity-target.c | 116 +++++----------------------------- drivers/md/dm-verity.h | 112 ++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 100 deletions(-) create mode 100644 drivers/md/dm-verity.h diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index b0a53c3b926d..7e200ba631fb 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -14,12 +14,10 @@ * access behavior. 
 */ -#include "dm-bufio.h" +#include "dm-verity.h" #include <linux/module.h> -#include <linux/device-mapper.h> #include <linux/reboot.h> -#include <crypto/hash.h> #define DM_MSG_PREFIX "verity" @@ -28,7 +26,6 @@ #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 -#define DM_VERITY_MAX_LEVELS 63 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 #define DM_VERITY_OPT_LOGGING "ignore_corruption" @@ -40,72 +37,6 @@ static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); -enum verity_mode { - DM_VERITY_MODE_EIO, - DM_VERITY_MODE_LOGGING, - DM_VERITY_MODE_RESTART -}; - -enum verity_block_type { - DM_VERITY_BLOCK_TYPE_DATA, - DM_VERITY_BLOCK_TYPE_METADATA -}; - -struct dm_verity { - struct dm_dev *data_dev; - struct dm_dev *hash_dev; - struct dm_target *ti; - struct dm_bufio_client *bufio; - char *alg_name; - struct crypto_shash *tfm; - u8 *root_digest; /* digest of the root block */ - u8 *salt; /* salt: its size is salt_size */ - unsigned salt_size; - sector_t data_start; /* data offset in 512-byte sectors */ - sector_t hash_start; /* hash start in blocks */ - sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ - unsigned char data_dev_block_bits; /* log2(data blocksize) */ - unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ - unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ - unsigned char levels; /* the number of tree levels */ - unsigned char version; - unsigned digest_size; /* digest size for the current hash algorithm */ - unsigned shash_descsize;/* the size of temporary space for crypto */ - int hash_failed; /* set to 1 if hash of any block failed */ - enum verity_mode mode; /* mode for handling verification errors */ - unsigned corrupted_errs;/* Number of errors for corrupted blocks */ - - struct workqueue_struct *verify_wq; - - /* starting blocks for each tree level. 0 is the lowest level. */ - sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; -}; - -struct dm_verity_io { - struct dm_verity *v; - - /* original value of bio->bi_end_io */ - bio_end_io_t *orig_bi_end_io; - - sector_t block; - unsigned n_blocks; - - struct bvec_iter iter; - - struct work_struct work; - - /* - * Three variably-size fields follow this struct: - * - * u8 hash_desc[v->shash_descsize]; - * u8 real_digest[v->digest_size]; - * u8 want_digest[v->digest_size]; - * - * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). - */ -}; - struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -113,21 +44,6 @@ struct dm_verity_prefetch_work { unsigned n_blocks; }; -static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) -{ - return (struct shash_desc *)(io + 1); -} - -static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize; -} - -static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; -} - /* * Auxiliary structure appended to each dm-bufio buffer. If the value * hash_verified is nonzero, hash of the block has been verified.
@@ -236,8 +152,8 @@ static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, return r; } -static int verity_hash(struct dm_verity *v, struct shash_desc *desc, - const u8 *data, size_t len, u8 *digest) +int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest) { int r; @@ -325,12 +241,12 @@ out: * Verify hash of a metadata block pertaining to the specified data block * ("block" argument) at a specified level ("level" argument). * - * On successful return, io_want_digest(v, io) contains the hash value for - * a lower tree level or for the data block (if we're at the lowest leve). + * On successful return, verity_io_want_digest(v, io) contains the hash value + * for a lower tree level or for the data block (if we're at the lowest level). * * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. * If "skip_unverified" is false, unverified buffer is hashed and verified - * against current value of io_want_digest(v, io). + * against current value of verity_io_want_digest(v, io). */ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, sector_t block, int level, bool skip_unverified, @@ -357,13 +273,13 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, goto release_ret_r; } - r = verity_hash(v, io_hash_desc(v, io), + r = verity_hash(v, verity_io_hash_desc(v, io), data, 1 << v->hash_dev_block_bits, - io_real_digest(v, io)); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) goto release_ret_r; - if (likely(memcmp(io_real_digest(v, io), want_digest, + if (likely(memcmp(verity_io_real_digest(v, io), want_digest, v->digest_size) == 0)) aux->hash_verified = 1; else if (verity_handle_err(v, @@ -387,8 +303,8 @@ release_ret_r: * Find a hash for a given block, write it to digest and verify the integrity * of the hash tree if necessary. */ -static int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest) +int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest) { int i; int r; @@ -430,10 +346,10 @@ static int verity_verify_io(struct dm_verity_io *io) for (b = 0; b < io->n_blocks; b++) { int r; unsigned todo; - struct shash_desc *desc = io_hash_desc(v, io); + struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, - io_want_digest(v, io)); + verity_io_want_digest(v, io)); if (unlikely(r < 0)) return r; @@ -462,12 +378,12 @@ static int verity_verify_io(struct dm_verity_io *io) todo -= len; } while (todo); - r = verity_hash_final(v, desc, io_real_digest(v, io)); + r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); if (unlikely(r < 0)) return r; - if (likely(memcmp(io_real_digest(v, io), - io_want_digest(v, io), v->digest_size) == 0)) + if (likely(memcmp(verity_io_real_digest(v, io), + verity_io_want_digest(v, io), v->digest_size) == 0)) continue; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, io->block + b)) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h new file mode 100644 index 000000000000..c7ad4fd05188 --- /dev/null +++ b/drivers/md/dm-verity.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2015 Google, Inc. + * + * Author: Mikulas Patocka + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * This file is released under the GPLv2. 
*/ + +#ifndef DM_VERITY_H +#define DM_VERITY_H + +#include "dm-bufio.h" +#include <linux/device-mapper.h> +#include <crypto/hash.h> + +#define DM_VERITY_MAX_LEVELS 63 + +enum verity_mode { + DM_VERITY_MODE_EIO, + DM_VERITY_MODE_LOGGING, + DM_VERITY_MODE_RESTART +}; + +enum verity_block_type { + DM_VERITY_BLOCK_TYPE_DATA, + DM_VERITY_BLOCK_TYPE_METADATA +}; + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_shash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + unsigned salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + unsigned digest_size; /* digest size for the current hash algorithm */ + unsigned shash_descsize;/* the size of temporary space for crypto */ + int hash_failed; /* set to 1 if hash of any block failed */ + enum verity_mode mode; /* mode for handling verification errors */ + unsigned corrupted_errs;/* Number of errors for corrupted blocks */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. */ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; +}; + +struct dm_verity_io { + struct dm_verity *v; + + /* original value of bio->bi_end_io */ + bio_end_io_t *orig_bi_end_io; + + sector_t block; + unsigned n_blocks; + + struct bvec_iter iter; + + struct work_struct work; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_desc[v->shash_descsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: verity_io_hash_desc(), verity_io_real_digest() + * and verity_io_want_digest(). + */ +}; + +static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (struct shash_desc *)(io + 1); +} + +static inline u8 *verity_io_real_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize; +} + +static inline u8 *verity_io_want_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; +} + +extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest); + +extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest); + +#endif /* DM_VERITY_H */ From 890b7865c5f5e7e5b5bc6acaa4193d967fb8bea7 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 16:30:36 -0500 Subject: [PATCH 0007/3220] UPSTREAM: dm verity: factor out verity_for_bv_block() verity_for_bv_block() will be re-used by optional dm-verity object.
Change-Id: I80e0f8e7c9f234fce3fbdf21cb05aba3041d7f98 Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit bb4d73ac5e4f0a6c4853f35824f6cb2d396a2f9c) --- drivers/md/dm-verity-target.c | 72 ++++++++++++++++++++++++----------- drivers/md/dm-verity.h | 6 +++ 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 7e200ba631fb..2b0ee52d1ad8 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -333,19 +333,61 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, return 0; } +/* + * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec + * starting from iter. + */ +int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, u8 *data, + size_t len)) +{ + unsigned todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + + do { + int r; + u8 *page; + unsigned len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + page = kmap_atomic(bv.bv_page); + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + + r = process(v, io, page + bv.bv_offset, len); + kunmap_atomic(page); + + if (r < 0) + return r; + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); + + return 0; +} + +static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); +} + /* * Verify one "dm_verity_io" structure. */ static int verity_verify_io(struct dm_verity_io *io) { struct dm_verity *v = io->v; - struct bio *bio = dm_bio_from_per_bio_data(io, - v->ti->per_bio_data_size); + struct bvec_iter start; unsigned b; for (b = 0; b < io->n_blocks; b++) { int r; - unsigned todo; struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, @@ -357,26 +399,10 @@ static int verity_verify_io(struct dm_verity_io *io) if (unlikely(r < 0)) return r; - todo = 1 << v->data_dev_block_bits; - do { - u8 *page; - unsigned len; - struct bio_vec bv = bio_iter_iovec(bio, io->iter); - - page = kmap_atomic(bv.bv_page); - len = bv.bv_len; - if (likely(len >= todo)) - len = todo; - r = verity_hash_update(v, desc, page + bv.bv_offset, - len); - kunmap_atomic(page); - - if (unlikely(r < 0)) - return r; - - bio_advance_iter(bio, &io->iter, len); - todo -= len; - } while (todo); + start = io->iter; + r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); + if (unlikely(r < 0)) + return r; r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); if (unlikely(r < 0)) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index c7ad4fd05188..f5af52df8e38 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -103,6 +103,12 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; } +extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, + u8 *data, size_t len)); + extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, const u8 *data, size_t len, u8 *digest); From 83c1827e5887c9288e4ecb7962c6dc63596ad922 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 14:26:30 +0000 Subject: [PATCH 0008/3220] UPSTREAM: dm verity: add 
support for forward error correction Add support for correcting corrupted blocks using Reed-Solomon. This code uses RS(255, N) interleaved across data and hash blocks. Each error-correcting block covers N bytes evenly distributed across the combined total data, so that each byte is a maximum distance away from the others. This makes it possible to recover from several consecutive corrupted blocks with relatively small space overhead. In addition, using verity hashes to locate erasures nearly doubles the effectiveness of error correction. Being able to detect corrupted blocks also improves performance, because only corrupted blocks need to be corrected. For a 2 GiB partition, RS(255, 253) (two parity bytes for each 253-byte block) can correct up to 16 MiB of consecutive corrupted blocks if erasures can be located, and 8 MiB if they cannot, with 16 MiB space overhead. Change-Id: Ife4f8889f7fbf0974bf3ed4be6d3322ae9b4cb0e Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit a739ff3f543afbb4a041c16cd0182c8e8d366e70) --- Documentation/device-mapper/verity.txt | 35 +- drivers/md/Kconfig | 12 + drivers/md/Makefile | 4 + drivers/md/dm-verity-fec.c | 812 +++++++++++++++++++++++++ drivers/md/dm-verity-fec.h | 152 +++++ drivers/md/dm-verity-target.c | 55 +- drivers/md/dm-verity.h | 10 + 7 files changed, 1071 insertions(+), 9 deletions(-) create mode 100644 drivers/md/dm-verity-fec.c create mode 100644 drivers/md/dm-verity-fec.h diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index e15bc1a0fb98..d602c801ff59 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -18,11 +18,11 @@ Construction Parameters 0 is the original format used in the Chromium OS. The salt is appended when hashing, digests are stored continuously and - the rest of the block is padded with zeros. + the rest of the block is padded with zeroes. 1 is the current format that should be used for new devices. The salt is prepended when hashing and each digest is - padded with zeros to the power of two. + padded with zeroes to the power of two. <dev> This is the device containing data, the integrity of which needs to be @@ -79,6 +79,32 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +use_fec_from_device <fec_dev> + Use forward error correction (FEC) to recover from corruption if hash + verification fails. Use encoding data from the specified device. This + may be the same device where data and hash blocks reside, in which case + fec_start must be outside data and hash areas. + + If the encoding data covers additional metadata, it must be accessible + on the hash device after the hash blocks. + + Note: block sizes for data and hash devices must match. Also, if the + verity <dev> is encrypted the <fec_dev> should be too. + +fec_roots <num> + Number of generator roots. This equals the number of parity bytes in + the encoding data. For example, in RS(M, N) encoding, the number of roots + is M-N. + +fec_blocks <num> + The number of encoding data blocks on the FEC device. The block size for + the FEC device is <data_block_size>. + +fec_start <offset> + This is the offset, in blocks, from the start of the + FEC device to the beginning of the encoding data. + + Theory of operation =================== @@ -98,6 +124,11 @@ per-block basis. This allows for a lightweight hash computation on first read into the page cache. Block hashes are stored linearly, aligned to the nearest block size.
+If forward error correction (FEC) support is enabled any recovery of +corrupted data will be verified using the cryptographic hash of the +corresponding data. This is why combining error correction with +integrity checking is essential. + Hash Tree --------- diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 7913fdcfc849..d8b0ab6f3753 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -458,6 +458,18 @@ config DM_VERITY If unsure, say N. +config DM_VERITY_FEC + bool "Verity forward error correction support" + depends on DM_VERITY + select REED_SOLOMON + select REED_SOLOMON_DEC8 + ---help--- + Add forward error correction support to dm-verity. This option + makes it possible to use pre-generated error correction data to + recover from corrupted blocks. + + If unsure, say N. + config DM_SWITCH tristate "Switch target support (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 94e9f6bb33d1..62a65764e8e0 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -64,3 +64,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif + +ifeq ($(CONFIG_DM_VERITY_FEC),y) +dm-verity-objs += dm-verity-fec.o +endif diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c new file mode 100644 index 000000000000..88143d36a1d2 --- /dev/null +++ b/drivers/md/dm-verity-fec.c @@ -0,0 +1,812 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include "dm-verity-fec.h" +#include <linux/math64.h> + +#define DM_MSG_PREFIX "verity-fec" + +/* + * If error correction has been configured, returns true. + */ +bool verity_fec_is_enabled(struct dm_verity *v) +{ + return v->fec && v->fec->dev; +} + +/* + * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable + * length fields. + */ +static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io) +{ + return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io); +} + +/* + * Return an interleaved offset for a byte in RS block. + */ +static inline u64 fec_interleave(struct dm_verity *v, u64 offset) +{ + u32 mod; + + mod = do_div(offset, v->fec->rsn); + return offset + mod * (v->fec->rounds << v->data_dev_block_bits); +} + +/* + * Decode an RS block using Reed-Solomon. + */ +static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, + u8 *data, u8 *fec, int neras) +{ + int i; + uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + + for (i = 0; i < v->fec->roots; i++) + par[i] = fec[i]; + + return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, + fio->erasures, 0, NULL); +} + +/* + * Read error-correcting codes for the requested RS block. Returns a pointer + * to the data block. Caller is responsible for releasing buf.
+ */ +static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, + unsigned *offset, struct dm_buffer **buf) +{ + u64 position, block; + u8 *res; + + position = (index + rsb) * v->fec->roots; + block = position >> v->data_dev_block_bits; + *offset = (unsigned)(position - (block << v->data_dev_block_bits)); + + res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf); + if (unlikely(IS_ERR(res))) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, (unsigned long long)rsb, + (unsigned long long)(v->fec->start + block), + PTR_ERR(res)); + *buf = NULL; + } + + return res; +} + +/* Loop over each preallocated buffer slot. */ +#define fec_for_each_prealloc_buffer(__i) \ + for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++) + +/* Loop over each extra buffer slot. */ +#define fec_for_each_extra_buffer(io, __i) \ + for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++) + +/* Loop over each allocated buffer. */ +#define fec_for_each_buffer(io, __i) \ + for (__i = 0; __i < (io)->nbufs; __i++) + +/* Loop over each RS block in each allocated buffer. */ +#define fec_for_each_buffer_rs_block(io, __i, __j) \ + fec_for_each_buffer(io, __i) \ + for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) + +/* + * Return a pointer to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline u8 *fec_buffer_rs_block(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned i, unsigned j) +{ + return &fio->bufs[i][j * v->fec->rsn]; +} + +/* + * Return an index to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline unsigned fec_buffer_rs_index(unsigned i, unsigned j) +{ + return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; +} + +/* + * Decode all RS blocks from buffers and copy corrected bytes into fio->output + * starting from block_offset. + */ +static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, + u64 rsb, int byte_index, unsigned block_offset, + int neras) +{ + int r, corrected = 0, res; + struct dm_buffer *buf; + unsigned n, i, offset; + u8 *par, *block; + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (IS_ERR(par)) + return PTR_ERR(par); + + /* + * Decode the RS blocks we have in bufs. Each RS block results in + * one corrected target byte and consumes fec->roots parity bytes. + */ + fec_for_each_buffer_rs_block(fio, n, i) { + block = fec_buffer_rs_block(v, fio, n, i); + res = fec_decode_rs8(v, fio, block, &par[offset], neras); + if (res < 0) { + dm_bufio_release(buf); + + r = res; + goto error; + } + + corrected += res; + fio->output[block_offset] = block[byte_index]; + + block_offset++; + if (block_offset >= 1 << v->data_dev_block_bits) + goto done; + + /* read the next block when we run out of parity bytes */ + offset += v->fec->roots; + if (offset >= 1 << v->data_dev_block_bits) { + dm_bufio_release(buf); + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (unlikely(IS_ERR(par))) + return PTR_ERR(par); + } + } +done: + r = corrected; +error: + if (r < 0 && neras) + DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", + v->data_dev->name, (unsigned long long)rsb, r); + else if (r > 0) + DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", + v->data_dev->name, (unsigned long long)rsb, r); + + return r; +} + +/* + * Locate data block erasures using verity hashes. 
+ */ +static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, + u8 *want_digest, u8 *data) +{ + if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), + data, 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)))) + return 0; + + return memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) != 0; +} + +/* + * Read data blocks that are part of the RS block and deinterleave as much as + * fits into buffers. Check for erasure locations if @neras is non-NULL. + */ +static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, + u64 rsb, u64 target, unsigned block_offset, + int *neras) +{ + int i, j, target_index = -1; + struct dm_buffer *buf; + struct dm_bufio_client *bufio; + struct dm_verity_fec_io *fio = fec_io(io); + u64 block, ileaved; + u8 *bbuf, *rs_block; + u8 want_digest[v->digest_size]; + unsigned n, k; + + if (neras) + *neras = 0; + + /* + * read each of the rsn data blocks that are part of the RS block, and + * interleave contents to available bufs + */ + for (i = 0; i < v->fec->rsn; i++) { + ileaved = fec_interleave(v, rsb * v->fec->rsn + i); + + /* + * target is the data block we want to correct, target_index is + * the index of this block within the rsn RS blocks + */ + if (ileaved == target) + target_index = i; + + block = ileaved >> v->data_dev_block_bits; + bufio = v->fec->data_bufio; + + if (block >= v->data_blocks) { + block -= v->data_blocks; + + /* + * blocks outside the area were assumed to contain + * zeros when encoding data was generated + */ + if (unlikely(block >= v->fec->hash_blocks)) + continue; + + block += v->hash_start; + bufio = v->bufio; + } + + bbuf = dm_bufio_read(bufio, block, &buf); + if (unlikely(IS_ERR(bbuf))) { + DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", + v->data_dev->name, + (unsigned long long)rsb, + (unsigned long long)block, PTR_ERR(bbuf)); + + /* assume the block is corrupted */ + if (neras && *neras <= v->fec->roots) + fio->erasures[(*neras)++] = i; + + continue; + } + + /* locate erasures if the block is on the data device */ + if (bufio == v->fec->data_bufio && + verity_hash_for_block(v, io, block, want_digest) == 0) { + /* + * skip if we have already found the theoretical + * maximum number (i.e. fec->roots) of erasures + */ + if (neras && *neras <= v->fec->roots && + fec_is_erasure(v, io, want_digest, bbuf)) + fio->erasures[(*neras)++] = i; + } + + /* + * deinterleave and copy the bytes that fit into bufs, + * starting from block_offset + */ + fec_for_each_buffer_rs_block(fio, n, j) { + k = fec_buffer_rs_index(n, j) + block_offset; + + if (k >= 1 << v->data_dev_block_bits) + goto done; + + rs_block = fec_buffer_rs_block(v, fio, n, j); + rs_block[i] = bbuf[k]; + } +done: + dm_bufio_release(buf); + } + + return target_index; +} + +/* + * Allocate RS control structure and FEC buffers from preallocated mempools, + * and attempt to allocate as many extra buffers as available. 
+ */ +static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + if (!fio->rs) { + fio->rs = mempool_alloc(v->fec->rs_pool, 0); + if (unlikely(!fio->rs)) { + DMERR("failed to allocate RS"); + return -ENOMEM; + } + } + + fec_for_each_prealloc_buffer(n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO); + if (unlikely(!fio->bufs[n])) { + DMERR("failed to allocate FEC buffer"); + return -ENOMEM; + } + } + + /* try to allocate the maximum number of buffers */ + fec_for_each_extra_buffer(fio, n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO); + /* we can manage with even one buffer if necessary */ + if (unlikely(!fio->bufs[n])) + break; + } + fio->nbufs = n; + + if (!fio->output) { + fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); + + if (!fio->output) { + DMERR("failed to allocate FEC page"); + return -ENOMEM; + } + } + + return 0; +} + +/* + * Initialize buffers and clear erasures. fec_read_bufs() assumes buffers are + * zeroed before deinterleaving. + */ +static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + fec_for_each_buffer(fio, n) + memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + + memset(fio->erasures, 0, sizeof(fio->erasures)); +} + +/* + * Decode all RS blocks in a single data block and return the target block + * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses + * hashes to locate erasures. + */ +static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 rsb, u64 offset, + bool use_erasures) +{ + int r, neras = 0; + unsigned pos; + + r = fec_alloc_bufs(v, fio); + if (unlikely(r < 0)) + return r; + + for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { + fec_init_bufs(v, fio); + + r = fec_read_bufs(v, io, rsb, offset, pos, + use_erasures ? &neras : NULL); + if (unlikely(r < 0)) + return r; + + r = fec_decode_bufs(v, fio, rsb, r, pos, neras); + if (r < 0) + return r; + + pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + } + + /* Always re-validate the corrected block against the expected hash */ + r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, + 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)); + if (unlikely(r < 0)) + return r; + + if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), + v->digest_size)) { + DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", + v->data_dev->name, (unsigned long long)rsb, neras); + return -EILSEQ; + } + + return 0; +} + +static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data, + size_t len) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + memcpy(data, &fio->output[fio->output_pos], len); + fio->output_pos += len; + + return 0; +} + +/* + * Correct errors in a block. Copies corrected block to dest if non-NULL, + * otherwise to a bio_vec starting from iter. + */ +int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + int r; + struct dm_verity_fec_io *fio = fec_io(io); + u64 offset, res, rsb; + + if (!verity_fec_is_enabled(v)) + return -EOPNOTSUPP; + + if (type == DM_VERITY_BLOCK_TYPE_METADATA) + block += v->data_blocks; + + /* + * For RS(M, N), the continuous FEC data is divided into blocks of N + * bytes. 
Since block size may not be divisible by N, the last block + * is zero padded when decoding. + * + * Each byte of the block is covered by a different RS(M, N) code, + * and each code is interleaved over N blocks to make it less likely + * that bursty corruption will leave us in unrecoverable state. + */ + + offset = block << v->data_dev_block_bits; + + res = offset; + div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + + /* + * The base RS block we can feed to the interleaver to find out all + * blocks required for decoding. + */ + rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); + + /* + * Locating erasures is slow, so attempt to recover the block without + * them first. Do a second attempt with erasures if the corruption is + * bad enough. + */ + r = fec_decode_rsb(v, io, fio, rsb, offset, false); + if (r < 0) { + r = fec_decode_rsb(v, io, fio, rsb, offset, true); + if (r < 0) + return r; + } + + if (dest) + memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + else if (iter) { + fio->output_pos = 0; + r = verity_for_bv_block(v, io, iter, fec_bv_copy); + } + + return r; +} + +/* + * Clean up per-bio data. + */ +void verity_fec_finish_io(struct dm_verity_io *io) +{ + unsigned n; + struct dm_verity_fec *f = io->v->fec; + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + mempool_free(fio->rs, f->rs_pool); + + fec_for_each_prealloc_buffer(n) + mempool_free(fio->bufs[n], f->prealloc_pool); + + fec_for_each_extra_buffer(fio, n) + mempool_free(fio->bufs[n], f->extra_pool); + + mempool_free(fio->output, f->output_pool); +} + +/* + * Initialize per-bio data. + */ +void verity_fec_init_io(struct dm_verity_io *io) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + fio->rs = NULL; + memset(fio->bufs, 0, sizeof(fio->bufs)); + fio->nbufs = 0; + fio->output = NULL; +} + +/* + * Append feature arguments and values to the status table. 
+ */ +unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen) +{ + if (!verity_fec_is_enabled(v)) + return sz; + + DMEMIT(" " DM_VERITY_OPT_FEC_DEV " %s " + DM_VERITY_OPT_FEC_BLOCKS " %llu " + DM_VERITY_OPT_FEC_START " %llu " + DM_VERITY_OPT_FEC_ROOTS " %d", + v->fec->dev->name, + (unsigned long long)v->fec->blocks, + (unsigned long long)v->fec->start, + v->fec->roots); + + return sz; +} + +void verity_fec_dtr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + + if (!verity_fec_is_enabled(v)) + goto out; + + mempool_destroy(f->rs_pool); + mempool_destroy(f->prealloc_pool); + mempool_destroy(f->extra_pool); + kmem_cache_destroy(f->cache); + + if (f->data_bufio) + dm_bufio_client_destroy(f->data_bufio); + if (f->bufio) + dm_bufio_client_destroy(f->bufio); + + if (f->dev) + dm_put_device(v->ti, f->dev); +out: + kfree(f); + v->fec = NULL; +} + +static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct dm_verity *v = (struct dm_verity *)pool_data; + + return init_rs(8, 0x11d, 0, 1, v->fec->roots); +} + +static void fec_rs_free(void *element, void *pool_data) +{ + struct rs_control *rs = (struct rs_control *)element; + + if (rs) + free_rs(rs); +} + +bool verity_is_fec_opt_arg(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_START) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)); +} + +int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + unsigned *argc, const char *arg_name) +{ + int r; + struct dm_target *ti = v->ti; + const char *arg_value; + unsigned long long num_ll; + unsigned char num_c; + char dummy; + + if (!*argc) { + ti->error = "FEC feature arguments require a value"; + return -EINVAL; + } + + arg_value = dm_shift_arg(as); + (*argc)--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev); + if (r) { + ti->error = "FEC device lookup failed"; + return r; + } + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + v->fec->blocks = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) >> + (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_START; + return -EINVAL; + } + v->fec->start = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { + if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || + num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || + num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + v->fec->roots = num_c; + + } else { + ti->error = "Unrecognized verity FEC feature request"; + return -EINVAL; + } + + return 0; +} + +/* + * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. 
+ */ +int verity_fec_ctr_alloc(struct dm_verity *v) +{ + struct dm_verity_fec *f; + + f = kzalloc(sizeof(struct dm_verity_fec), GFP_KERNEL); + if (!f) { + v->ti->error = "Cannot allocate FEC structure"; + return -ENOMEM; + } + v->fec = f; + + return 0; +} + +/* + * Validate arguments and preallocate memory. Must be called after arguments + * have been parsed using verity_fec_parse_opt_args. + */ +int verity_fec_ctr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + struct dm_target *ti = v->ti; + u64 hash_blocks; + + if (!verity_fec_is_enabled(v)) { + verity_fec_dtr(v); + return 0; + } + + /* + * FEC is computed over data blocks, possible metadata, and + * hash blocks. In other words, FEC covers total of fec_blocks + * blocks consisting of the following: + * + * data blocks | hash blocks | metadata (optional) + * + * We allow metadata after hash blocks to support a use case + * where all data is stored on the same device and FEC covers + * the entire area. + * + * If metadata is included, we require it to be available on the + * hash device after the hash blocks. + */ + + hash_blocks = v->hash_blocks - v->hash_start; + + /* + * Require matching block sizes for data and hash devices for + * simplicity. + */ + if (v->data_dev_block_bits != v->hash_dev_block_bits) { + ti->error = "Block sizes must match to use FEC"; + return -EINVAL; + } + + if (!f->roots) { + ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + f->rsn = DM_VERITY_FEC_RSM - f->roots; + + if (!f->blocks) { + ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + f->rounds = f->blocks; + if (sector_div(f->rounds, f->rsn)) + f->rounds++; + + /* + * Due to optional metadata, f->blocks can be larger than + * data_blocks and hash_blocks combined. + */ + if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + /* + * Metadata is accessed through the hash device, so we require + * it to be large enough. 
+ */ + f->hash_blocks = f->blocks - v->data_blocks; + if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + ti->error = "Hash device is too small for " + DM_VERITY_OPT_FEC_BLOCKS; + return -E2BIG; + } + + f->bufio = dm_bufio_client_create(f->dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->bufio)) { + ti->error = "Cannot initialize FEC bufio client"; + return PTR_ERR(f->bufio); + } + + if (dm_bufio_get_device_size(f->bufio) < + ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) { + ti->error = "FEC device is too small"; + return -E2BIG; + } + + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->data_bufio)) { + ti->error = "Cannot initialize FEC data bufio client"; + return PTR_ERR(f->data_bufio); + } + + if (dm_bufio_get_device_size(f->data_bufio) < v->data_blocks) { + ti->error = "Data device is too small"; + return -E2BIG; + } + + /* Preallocate an rs_control structure for each worker thread */ + f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc, + fec_rs_free, (void *) v); + if (!f->rs_pool) { + ti->error = "Cannot allocate RS pool"; + return -ENOMEM; + } + + f->cache = kmem_cache_create("dm_verity_fec_buffers", + f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + 0, 0, NULL); + if (!f->cache) { + ti->error = "Cannot create FEC buffer cache"; + return -ENOMEM; + } + + /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */ + f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() * + DM_VERITY_FEC_BUF_PREALLOC, + f->cache); + if (!f->prealloc_pool) { + ti->error = "Cannot allocate FEC buffer prealloc pool"; + return -ENOMEM; + } + + f->extra_pool = mempool_create_slab_pool(0, f->cache); + if (!f->extra_pool) { + ti->error = "Cannot allocate FEC buffer extra pool"; + return -ENOMEM; + } + + /* Preallocate an output buffer for each thread */ + f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(), + 1 << v->data_dev_block_bits); + if (!f->output_pool) { + ti->error = "Cannot allocate FEC output pool"; + return -ENOMEM; + } + + /* Reserve space for our per-bio data */ + ti->per_bio_data_size += sizeof(struct dm_verity_fec_io); + + return 0; +} diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h new file mode 100644 index 000000000000..7fa0298b995e --- /dev/null +++ b/drivers/md/dm-verity-fec.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#ifndef DM_VERITY_FEC_H +#define DM_VERITY_FEC_H + +#include "dm-verity.h" +#include + +/* Reed-Solomon(M, N) parameters */ +#define DM_VERITY_FEC_RSM 255 +#define DM_VERITY_FEC_MAX_RSN 253 +#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ + +/* buffers for deinterleaving and decoding */ +#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +/* we need buffers for at most 1 << block size RS blocks */ +#define DM_VERITY_FEC_BUF_MAX \ + (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) + +#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" +#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" +#define DM_VERITY_OPT_FEC_START "fec_start" +#define DM_VERITY_OPT_FEC_ROOTS "fec_roots" + +/* configuration */ +struct dm_verity_fec { + struct dm_dev *dev; /* parity data device */ + struct dm_bufio_client *data_bufio; /* for data dev access */ + struct dm_bufio_client *bufio; /* for parity data access */ + sector_t start; /* parity data start in blocks */ + sector_t blocks; /* number of blocks covered */ + sector_t rounds; /* number of interleaving rounds */ + sector_t hash_blocks; /* blocks covered after v->hash_start */ + unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ + unsigned char rsn; /* N of RS(M, N) */ + mempool_t *rs_pool; /* mempool for fio->rs */ + mempool_t *prealloc_pool; /* mempool for preallocated buffers */ + mempool_t *extra_pool; /* mempool for extra buffers */ + mempool_t *output_pool; /* mempool for output */ + struct kmem_cache *cache; /* cache for buffers */ +}; + +/* per-bio data */ +struct dm_verity_fec_io { + struct rs_control *rs; /* Reed-Solomon state */ + int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ + unsigned nbufs; /* number of buffers allocated */ + u8 *output; /* buffer for corrected output */ + size_t output_pos; +}; + +#ifdef CONFIG_DM_VERITY_FEC + +/* each feature parameter requires a value */ +#define DM_VERITY_OPTS_FEC 8 + +extern bool verity_fec_is_enabled(struct dm_verity *v); + +extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, + u8 *dest, struct bvec_iter *iter); + +extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen); + +extern void verity_fec_finish_io(struct dm_verity_io *io); +extern void verity_fec_init_io(struct dm_verity_io *io); + +extern bool verity_is_fec_opt_arg(const char *arg_name); +extern int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, unsigned *argc, + const char *arg_name); + +extern void verity_fec_dtr(struct dm_verity *v); + +extern int verity_fec_ctr_alloc(struct dm_verity *v); +extern int verity_fec_ctr(struct dm_verity *v); + +#else /* !CONFIG_DM_VERITY_FEC */ + +#define DM_VERITY_OPTS_FEC 0 + +static inline bool verity_fec_is_enabled(struct dm_verity *v) +{ + return false; +} + +static inline int verity_fec_decode(struct dm_verity *v, + struct dm_verity_io *io, + enum verity_block_type type, + sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + return -EOPNOTSUPP; +} + +static inline unsigned verity_fec_status_table(struct dm_verity *v, + unsigned sz, char *result, + unsigned maxlen) +{ + return sz; +} + +static inline void verity_fec_finish_io(struct dm_verity_io *io) +{ +} + +static inline void verity_fec_init_io(struct dm_verity_io *io) +{ +} + +static inline bool 
verity_is_fec_opt_arg(const char *arg_name) +{ + return false; +} + +static inline int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, + unsigned *argc, + const char *arg_name) +{ + return -EINVAL; +} + +static inline void verity_fec_dtr(struct dm_verity *v) +{ +} + +static inline int verity_fec_ctr_alloc(struct dm_verity *v) +{ + return 0; +} + +static inline int verity_fec_ctr(struct dm_verity *v) +{ + return 0; +} + +#endif /* CONFIG_DM_VERITY_FEC */ + +#endif /* DM_VERITY_FEC_H */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 2b0ee52d1ad8..4f90ec2c6b7a 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -15,6 +15,7 @@ */ #include "dm-verity.h" +#include "dm-verity-fec.h" #include #include @@ -31,7 +32,7 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" -#define DM_VERITY_OPTS_MAX 1 +#define DM_VERITY_OPTS_MAX (1 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -282,6 +283,10 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (likely(memcmp(verity_io_real_digest(v, io), want_digest, v->digest_size) == 0)) aux->hash_verified = 1; + else if (verity_fec_decode(v, io, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data, NULL) == 0) + aux->hash_verified = 1; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, hash_block)) { @@ -411,8 +416,11 @@ static int verity_verify_io(struct dm_verity_io *io) if (likely(memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), v->digest_size) == 0)) continue; + else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b, NULL, &start) == 0) + continue; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) + io->block + b)) return -EIO; } @@ -430,6 +438,8 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio->bi_end_io = io->orig_bi_end_io; bio->bi_error = error; + verity_fec_finish_io(io); + bio_endio(bio); } @@ -444,7 +454,7 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_error && !verity_fec_is_enabled(io->v)) { verity_finish_io(io, bio->bi_error); return; } @@ -547,6 +557,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_private = io; io->iter = bio->bi_iter; + verity_fec_init_io(io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -561,6 +573,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; + unsigned args = 0; unsigned sz = 0; unsigned x; @@ -587,8 +600,15 @@ static void verity_status(struct dm_target *ti, status_type_t type, else for (x = 0; x < v->salt_size; x++) DMEMIT("%02x", v->salt[x]); + if (v->mode != DM_VERITY_MODE_EIO) + args++; + if (verity_fec_is_enabled(v)) + args += DM_VERITY_OPTS_FEC; + if (!args) + return; + DMEMIT(" %u", args); if (v->mode != DM_VERITY_MODE_EIO) { - DMEMIT(" 1 "); + DMEMIT(" "); switch (v->mode) { case DM_VERITY_MODE_LOGGING: DMEMIT(DM_VERITY_OPT_LOGGING); @@ -600,6 +620,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + sz = verity_fec_status_table(v, sz, result, maxlen); break; } } @@ -662,6 +683,8 @@ static void verity_dtr(struct dm_target *ti) if (v->data_dev) dm_put_device(ti, v->data_dev); + verity_fec_dtr(v); + kfree(v); } @@ -694,6 +717,12 @@ 
static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { v->mode = DM_VERITY_MODE_RESTART; continue; + + } else if (verity_is_fec_opt_arg(arg_name)) { + r = verity_fec_parse_opt_args(as, v, &argc, arg_name); + if (r) + return r; + continue; } ti->error = "Unrecognized verity feature request"; @@ -736,6 +765,10 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->private = v; v->ti = ti; + r = verity_fec_ctr_alloc(v); + if (r) + goto bad; + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { ti->error = "Device must be readonly"; r = -EINVAL; @@ -924,8 +957,6 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io)); - /* WQ_UNBOUND greatly improves performance when running on ramdisk */ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!v->verify_wq) { @@ -934,6 +965,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + ti->per_bio_data_size = sizeof(struct dm_verity_io) + + v->shash_descsize + v->digest_size * 2; + + r = verity_fec_ctr(v); + if (r) + goto bad; + + ti->per_bio_data_size = roundup(ti->per_bio_data_size, + __alignof__(struct dm_verity_io)); + return 0; bad: @@ -944,7 +985,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index f5af52df8e38..8e853722f6c6 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -29,6 +29,8 @@ enum verity_block_type { DM_VERITY_BLOCK_TYPE_METADATA }; +struct dm_verity_fec; + struct dm_verity { struct dm_dev *data_dev; struct dm_dev *hash_dev; @@ -58,6 +60,8 @@ struct dm_verity { /* starting blocks for each tree level. 0 is the lowest level. */ sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; + + struct dm_verity_fec *fec; /* forward error correction */ }; struct dm_verity_io { @@ -103,6 +107,12 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; } +static inline u8 *verity_io_digest_end(struct dm_verity *v, + struct dm_verity_io *io) +{ + return verity_io_want_digest(v, io) + v->digest_size; +} + extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, struct bvec_iter *iter, int (*process)(struct dm_verity *v, From f75998fe8443541f599455aba93871be1e124b9a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 3 Dec 2015 14:26:31 +0000 Subject: [PATCH 0009/3220] UPSTREAM: dm verity: add ignore_zero_blocks feature If ignore_zero_blocks is enabled dm-verity will return zeroes for blocks matching a zero hash without validating the content. 
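As a usage sketch (not part of the upstream description; the device names, sizes, root digest and salt below are hypothetical placeholders), the feature is enabled by passing ignore_zero_blocks as an optional argument at the end of the verity target line:

  dmsetup create vroot --readonly --table \
    "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 \
     $ROOT_DIGEST $SALT 1 ignore_zero_blocks"

The trailing "1" is the optional-argument count already defined by the verity table format; 262144 data blocks of 4096 bytes correspond to the 2097152 512-byte sectors mapped by the target.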
Change-Id: I728fa4b2586b29f2793ea5cb014289892819d249 Signed-off-by: Sami Tolvanen Signed-off-by: Mike Snitzer (cherry picked from commit 0cc37c2df4fa0aa702f9662edce4b7ce12c86b7a) --- Documentation/device-mapper/verity.txt | 5 ++ drivers/md/dm-verity-fec.c | 8 ++- drivers/md/dm-verity-target.c | 87 +++++++++++++++++++++++--- drivers/md/dm-verity.h | 3 +- 4 files changed, 93 insertions(+), 10 deletions(-) diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index d602c801ff59..89fd8f9a259f 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -79,6 +79,11 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +ignore_zero_blocks + Do not verify blocks that are expected to contain zeroes and always return + zeroes instead. This may be useful if the partition contains unused blocks + that are not guaranteed to contain zeroes. + use_fec_from_device Use forward error correction (FEC) to recover from corruption if hash verification fails. Use encoding data from the specified device. This diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 88143d36a1d2..1cc10c4de701 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -205,6 +205,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, u64 rsb, u64 target, unsigned block_offset, int *neras) { + bool is_zero; int i, j, target_index = -1; struct dm_buffer *buf; struct dm_bufio_client *bufio; @@ -264,7 +265,12 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, /* locate erasures if the block is on the data device */ if (bufio == v->fec->data_bufio && - verity_hash_for_block(v, io, block, want_digest) == 0) { + verity_hash_for_block(v, io, block, want_digest, + &is_zero) == 0) { + /* skip known zero blocks entirely */ + if (is_zero) + continue; + /* * skip if we have already found the theoretical * maximum number (i.e. fec->roots) of erasures diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 4f90ec2c6b7a..5c5d30cb6ec5 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -31,8 +31,9 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" -#define DM_VERITY_OPTS_MAX (1 + DM_VERITY_OPTS_FEC) +#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -309,10 +310,9 @@ release_ret_r: * of the hash tree if necessary. 
*/ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest) + sector_t block, u8 *digest, bool *is_zero) { - int i; - int r; + int r = 0, i; if (likely(v->levels)) { /* @@ -324,7 +324,7 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, */ r = verity_verify_level(v, io, block, 0, true, digest); if (likely(r <= 0)) - return r; + goto out; } memcpy(digest, v->root_digest, v->digest_size); @@ -332,10 +332,15 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, for (i = v->levels - 1; i >= 0; i--) { r = verity_verify_level(v, io, block, i, false, digest); if (unlikely(r)) - return r; + goto out; } +out: + if (!r && v->zero_digest) + *is_zero = !memcmp(v->zero_digest, digest, v->digest_size); + else + *is_zero = false; - return 0; + return r; } /* @@ -382,11 +387,19 @@ static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); } +static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + memset(data, 0, len); + return 0; +} + /* * Verify one "dm_verity_io" structure. */ static int verity_verify_io(struct dm_verity_io *io) { + bool is_zero; struct dm_verity *v = io->v; struct bvec_iter start; unsigned b; @@ -396,10 +409,24 @@ static int verity_verify_io(struct dm_verity_io *io) struct shash_desc *desc = verity_io_hash_desc(v, io); r = verity_hash_for_block(v, io, io->block + b, - verity_io_want_digest(v, io)); + verity_io_want_digest(v, io), + &is_zero); if (unlikely(r < 0)) return r; + if (is_zero) { + /* + * If we expect a zero block, don't validate, just + * return zeros. + */ + r = verity_for_bv_block(v, io, &io->iter, + verity_bv_zero); + if (unlikely(r < 0)) + return r; + + continue; + } + r = verity_hash_init(v, desc); if (unlikely(r < 0)) return r; @@ -604,6 +631,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, args++; if (verity_fec_is_enabled(v)) args += DM_VERITY_OPTS_FEC; + if (v->zero_digest) + args++; if (!args) return; DMEMIT(" %u", args); @@ -620,6 +649,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + if (v->zero_digest) + DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); sz = verity_fec_status_table(v, sz, result, maxlen); break; } @@ -671,6 +702,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->salt); kfree(v->root_digest); + kfree(v->zero_digest); if (v->tfm) crypto_free_shash(v->tfm); @@ -688,6 +720,37 @@ static void verity_dtr(struct dm_target *ti) kfree(v); } +static int verity_alloc_zero_digest(struct dm_verity *v) +{ + int r = -ENOMEM; + struct shash_desc *desc; + u8 *zero_data; + + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); + + if (!v->zero_digest) + return r; + + desc = kmalloc(v->shash_descsize, GFP_KERNEL); + + if (!desc) + return r; /* verity_dtr will free zero_digest */ + + zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); + + if (!zero_data) + goto out; + + r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, + v->zero_digest); + +out: + kfree(desc); + kfree(zero_data); + + return r; +} + static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) { int r; @@ -718,6 +781,14 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) v->mode = DM_VERITY_MODE_RESTART; continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { + r = verity_alloc_zero_digest(v); + if (r) { + ti->error = "Cannot allocate zero 
digest"; + return r; + } + continue; + } else if (verity_is_fec_opt_arg(arg_name)) { r = verity_fec_parse_opt_args(as, v, &argc, arg_name); if (r) diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 8e853722f6c6..fb419f422d73 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -40,6 +40,7 @@ struct dm_verity { struct crypto_shash *tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ + u8 *zero_digest; /* digest for a zero block */ unsigned salt_size; sector_t data_start; /* data offset in 512-byte sectors */ sector_t hash_start; /* hash start in blocks */ @@ -123,6 +124,6 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, const u8 *data, size_t len, u8 *digest); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, - sector_t block, u8 *digest); + sector_t block, u8 *digest, bool *is_zero); #endif /* DM_VERITY_H */ From fcd614efb6c4dd4c4f543feefa6be7f8e49dcc09 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 16 Dec 2015 16:23:49 +0000 Subject: [PATCH 0010/3220] ANDROID: android: base-cfg: enable CONFIG_DM_VERITY_FEC Bug: 21893453 Change-Id: Idd0dfe4e3e527df2eff2f0d734effc40dce294c7 Signed-off-by: Sami Tolvanen (cherry picked from commit 9408350ed80005174918ce5147490035b2cf451b) --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index a4cffb1840d7..220ddb5304df 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -21,6 +21,7 @@ CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DM_CRYPT=y CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HIGH_RES_TIMERS=y From e7d40d2bf8b2991f051734dd45a95385644f8147 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 30 Mar 2016 14:10:13 -0700 Subject: [PATCH 0011/3220] ANDROID: dm verity fec: add sysfs attribute fec/corrected Add a sysfs entry that allows user space to determine whether dm-verity has come across correctable errors on the underlying block device. 
Bug: 22655252 Bug: 27928374 Change-Id: I80547a2aa944af2fb9ffde002650482877ade31b Signed-off-by: Sami Tolvanen (cherry picked from commit 7911fad5f0a2cf5afc2215657219a21e6630e001) --- drivers/md/dm-verity-fec.c | 45 +++++++++++++++++++++++++++++++++++++- drivers/md/dm-verity-fec.h | 3 +++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1cc10c4de701..ad10d6d8ed28 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -11,6 +11,7 @@ #include "dm-verity-fec.h" #include +#include #define DM_MSG_PREFIX "verity-fec" @@ -175,9 +176,11 @@ error: if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) + else if (r > 0) { DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", v->data_dev->name, (unsigned long long)rsb, r); + atomic_add_unless(&v->fec->corrected, 1, INT_MAX); + } return r; } @@ -548,6 +551,7 @@ unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, void verity_fec_dtr(struct dm_verity *v) { struct dm_verity_fec *f = v->fec; + struct kobject *kobj = &f->kobj_holder.kobj; if (!verity_fec_is_enabled(v)) goto out; @@ -564,6 +568,12 @@ void verity_fec_dtr(struct dm_verity *v) if (f->dev) dm_put_device(v->ti, f->dev); + + if (kobj->state_initialized) { + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); + } + out: kfree(f); v->fec = NULL; @@ -652,6 +662,27 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, return 0; } +static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec, + kobj_holder.kobj); + + return sprintf(buf, "%d\n", atomic_read(&f->corrected)); +} + +static struct kobj_attribute attr_corrected = __ATTR_RO(corrected); + +static struct attribute *fec_attrs[] = { + &attr_corrected.attr, + NULL +}; + +static struct kobj_type fec_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = fec_attrs +}; + /* * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. */ @@ -675,8 +706,10 @@ int verity_fec_ctr_alloc(struct dm_verity *v) */ int verity_fec_ctr(struct dm_verity *v) { + int r; struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; + struct mapped_device *md = dm_table_get_md(ti->table); u64 hash_blocks; if (!verity_fec_is_enabled(v)) { @@ -684,6 +717,16 @@ int verity_fec_ctr(struct dm_verity *v) return 0; } + /* Create a kobject and sysfs attributes */ + init_completion(&f->kobj_holder.completion); + + r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype, + &disk_to_dev(dm_disk(md))->kobj, "%s", "fec"); + if (r) { + ti->error = "Cannot create kobject"; + return r; + } + /* * FEC is computed over data blocks, possible metadata, and * hash blocks. 
In other words, FEC covers total of fec_blocks diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 7fa0298b995e..8c4bee052a73 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -12,6 +12,7 @@ #ifndef DM_VERITY_FEC_H #define DM_VERITY_FEC_H +#include "dm.h" #include "dm-verity.h" #include @@ -48,6 +49,8 @@ struct dm_verity_fec { mempool_t *extra_pool; /* mempool for extra buffers */ mempool_t *output_pool; /* mempool for output */ struct kmem_cache *cache; /* cache for buffers */ + atomic_t corrected; /* corrected errors */ + struct dm_kobject_holder kobj_holder; /* for sysfs attributes */ }; /* per-bio data */ From 2ebb4d97e0c9afeaa45c92f6df2d169d25b6db0b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 4 Feb 2015 13:54:48 -0800 Subject: [PATCH 0012/3220] cpufreq: interactive: fix policy locking cpufreq_interactive_speedchange_task() is running as a separate kernel thread and is calling __cpufreq_driver_target(), which requires callers to hold policy->rwsem for writing to prevent racing with other parts of the kernel trying to adjust the frequency, for example kernel thermal throttling. Let's change the code to take policy->rwsem and while at it refactor the code a bit. This was originally 2 changes reviewed at: https://chromium-review.googlesource.com/246273 https://chromium-review.googlesource.com/256120 Change-Id: Icc2d97c6c1b929acd2ee32e8c81d81fd2af778ab Signed-off-by: Dmitry Torokhov Reviewed-by: Dylan Reid Reviewed-by: Douglas Anderson Signed-off-by: Dmitry Torokhov --- drivers/cpufreq/cpufreq_interactive.c | 99 ++++++++++++++++----------- 1 file changed, 60 insertions(+), 39 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 0be66df4a6e6..5224e897d460 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -511,6 +511,58 @@ static void cpufreq_interactive_idle_end(void) up_read(&pcpu->enable_sem); } +static void cpufreq_interactive_get_policy_info(struct cpufreq_policy *policy, + unsigned int *pmax_freq, + u64 *phvt, u64 *pfvt) +{ + struct cpufreq_interactive_cpuinfo *pcpu; + unsigned int max_freq = 0; + u64 hvt = ~0ULL, fvt = 0; + unsigned int i; + + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + + fvt = max(fvt, pcpu->loc_floor_val_time); + if (pcpu->target_freq > max_freq) { + max_freq = pcpu->target_freq; + hvt = pcpu->loc_hispeed_val_time; + } else if (pcpu->target_freq == max_freq) { + hvt = min(hvt, pcpu->loc_hispeed_val_time); + } + } + + *pmax_freq = max_freq; + *phvt = hvt; + *pfvt = fvt; +} + +static void cpufreq_interactive_adjust_cpu(unsigned int cpu, + struct cpufreq_policy *policy) +{ + struct cpufreq_interactive_cpuinfo *pcpu; + u64 hvt, fvt; + unsigned int max_freq; + int i; + + cpufreq_interactive_get_policy_info(policy, &max_freq, &hvt, &fvt); + + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + pcpu->pol_floor_val_time = fvt; + } + + if (max_freq != policy->cur) { + __cpufreq_driver_target(policy, max_freq, CPUFREQ_RELATION_H); + for_each_cpu(i, policy->cpus) { + pcpu = &per_cpu(cpuinfo, i); + pcpu->pol_hispeed_val_time = hvt; + } + } + + trace_cpufreq_interactive_setspeed(cpu, max_freq, policy->cur); +} + static int cpufreq_interactive_speedchange_task(void *data) { unsigned int cpu; @@ -539,49 +591,18 @@ static int cpufreq_interactive_speedchange_task(void *data) spin_unlock_irqrestore(&speedchange_cpumask_lock, flags); for_each_cpu(cpu, &tmp_mask) { - unsigned int j; - 
unsigned int max_freq = 0; - struct cpufreq_interactive_cpuinfo *pjcpu; - u64 hvt = ~0ULL, fvt = 0; - pcpu = &per_cpu(cpuinfo, cpu); - if (!down_read_trylock(&pcpu->enable_sem)) - continue; - if (!pcpu->governor_enabled) { + + down_write(&pcpu->policy->rwsem); + + if (likely(down_read_trylock(&pcpu->enable_sem))) { + if (likely(pcpu->governor_enabled)) + cpufreq_interactive_adjust_cpu(cpu, + pcpu->policy); up_read(&pcpu->enable_sem); - continue; } - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - - fvt = max(fvt, pjcpu->loc_floor_val_time); - if (pjcpu->target_freq > max_freq) { - max_freq = pjcpu->target_freq; - hvt = pjcpu->loc_hispeed_val_time; - } else if (pjcpu->target_freq == max_freq) { - hvt = min(hvt, pjcpu->loc_hispeed_val_time); - } - } - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - pjcpu->pol_floor_val_time = fvt; - } - - if (max_freq != pcpu->policy->cur) { - __cpufreq_driver_target(pcpu->policy, - max_freq, - CPUFREQ_RELATION_H); - for_each_cpu(j, pcpu->policy->cpus) { - pjcpu = &per_cpu(cpuinfo, j); - pjcpu->pol_hispeed_val_time = hvt; - } - } - trace_cpufreq_interactive_setspeed(cpu, - pcpu->target_freq, - pcpu->policy->cur); - - up_read(&pcpu->enable_sem); + up_write(&pcpu->policy->rwsem); } } From 440a57717bcc2a86a5e9800bce62e14d8115a001 Mon Sep 17 00:00:00 2001 From: Daniel Kurtz Date: Thu, 28 May 2015 12:08:11 +0800 Subject: [PATCH 0013/3220] cpufreq: interactive: only apply interactive boost when enabled Only apply the interactive boost when the interactive governor is enabled. This seems like the right thing to do. This was originally reviewed on https://chromium-review.googlesource.com/273501 Change-Id: I5f4a7320683eada099f9a4253e3d6b0f03057fe8 Signed-off-by: Daniel Kurtz Reviewed-by: Douglas Anderson Signed-off-by: Dmitry Torokhov --- drivers/cpufreq/cpufreq_interactive.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 5224e897d460..bd83be39f17b 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -622,9 +622,20 @@ static void cpufreq_interactive_boost(struct cpufreq_interactive_tunables *tunab for_each_online_cpu(i) { pcpu = &per_cpu(cpuinfo, i); - if (tunables != pcpu->policy->governor_data) + + if (!down_read_trylock(&pcpu->enable_sem)) continue; + if (!pcpu->governor_enabled) { + up_read(&pcpu->enable_sem); + continue; + } + + if (tunables != pcpu->policy->governor_data) { + up_read(&pcpu->enable_sem); + continue; + } + spin_lock_irqsave(&pcpu->target_freq_lock, flags[1]); if (pcpu->target_freq < tunables->hispeed_freq) { pcpu->target_freq = tunables->hispeed_freq; @@ -634,6 +645,8 @@ static void cpufreq_interactive_boost(struct cpufreq_interactive_tunables *tunab anyboost = 1; } spin_unlock_irqrestore(&pcpu->target_freq_lock, flags[1]); + + up_read(&pcpu->enable_sem); } spin_unlock_irqrestore(&speedchange_cpumask_lock, flags[0]); From 03fbd079bac71e15a414082cb5aee980ce2935be Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 31 Mar 2016 13:21:09 -0700 Subject: [PATCH 0014/3220] android: base-cfg: Add CONFIG_INET_DIAG_DESTROY Change-Id: I67430b05eca8fd520d2795d3db60faf2ec0fab9e Signed-off-by: Dmitry Shmidt --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 220ddb5304df..fa53af0c37ad 100644 --- a/android/configs/android-base.cfg +++ 
b/android/configs/android-base.cfg @@ -29,6 +29,7 @@ CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y CONFIG_INET_ESP=y CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_IP6_NF_FILTER=y From d68354b3b7ef8b8a1d9cf0ba071123ed62fe567f Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Thu, 28 Jan 2016 11:12:25 -0800 Subject: [PATCH 0015/3220] ANDROID: mmc: Add CONFIG_MMC_SIMULATE_MAX_SPEED When CONFIG_MMC_SIMULATE_MAX_SPEED is enabled, Expose max_read_speed, max_write_speed and cache_size default module parameters and sysfs controls to simulate a slow eMMC device. Default values are 0 (off), 0 (off) and 4 MB respectively. Signed-off-by: Mark Salyzyn Bug: 26976972 Change-Id: I342bfbd8b85f9b790e3f0e1e4e51a900ae07e05d --- Documentation/block/00-INDEX | 6 + Documentation/block/mmc-max-speed.txt | 38 ++++ drivers/mmc/card/Kconfig | 12 ++ drivers/mmc/card/block.c | 300 ++++++++++++++++++++++++++ drivers/mmc/card/queue.h | 8 + 5 files changed, 364 insertions(+) create mode 100644 Documentation/block/mmc-max-speed.txt diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index e840b47613f7..bc5148757edb 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -26,3 +26,9 @@ switching-sched.txt - Switching I/O schedulers at runtime writeback_cache_control.txt - Control of volatile write back caches +mmc-max-speed.txt + - eMMC layer speed simulation, related to /sys/block/mmcblk*/ + attributes: + max_read_speed + max_write_speed + cache_size diff --git a/Documentation/block/mmc-max-speed.txt b/Documentation/block/mmc-max-speed.txt new file mode 100644 index 000000000000..3f052b9fb999 --- /dev/null +++ b/Documentation/block/mmc-max-speed.txt @@ -0,0 +1,38 @@ +eMMC Block layer simulation speed controls in /sys/block/mmcblk*/ +=============================================== + +Turned on with CONFIG_MMC_SIMULATE_MAX_SPEED which enables MMC device speed +limiting. Used to test and simulate the behavior of the system when +confronted with a slow MMC. + +Enables max_read_speed, max_write_speed and cache_size attributes and module +default parameters to control the write or read maximum KB/second speed +behaviors. + +NB: There is room for improving the algorithm for aspects tied directly to +eMMC specific behavior. For instance, wear leveling and stalls from an +exhausted erase pool. We would expect that if there was a need to provide +similar speed simulation controls to other types of block devices, aspects of +their behavior are modelled separately (e.g. head seek times, heat assist, +shingling and rotational latency). + +/sys/block/mmcblk0/max_read_speed: + +Number of KB/second reads allowed to the block device. Used to test and +simulate the behavior of the system when confronted with a slow reading MMC. +Set to 0 or "off" to place no speed limit. + +/sys/block/mmcblk0/max_write_speed: + +Number of KB/second writes allowed to the block device. Used to test and +simulate the behavior of the system when confronted with a slow writing MMC. +Set to 0 or "off" to place no speed limit. + +/sys/block/mmcblk0/cache_size: + +Number of MB of high speed memory or high speed SLC cache expected on the +eMMC device being simulated. Used to help simulate the write-back behavior +more accurately. The assumption is the cache has no delay, but draws down +in the background to the MLC/TLC primary store at the max_write_speed rate. +Any write speed delays will show up when the cache is full, or when an I/O +request to flush is issued. 
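As a usage sketch for the controls documented above (hypothetical values; mmcblk0 is assumed to be the device under test, and writes require root):

  echo 1000 > /sys/block/mmcblk0/max_write_speed    # cap writes at 1000 KB/s
  echo off > /sys/block/mmcblk0/max_read_speed      # remove the read limit
  cat /sys/block/mmcblk0/cache_size                 # reports e.g. "4MB 0% used"

Both speed attributes accept "off" as well as 0 to disable the limit.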
diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig index 5562308699bc..6142ec1b9dfb 100644 --- a/drivers/mmc/card/Kconfig +++ b/drivers/mmc/card/Kconfig @@ -68,3 +68,15 @@ config MMC_TEST This driver is only of interest to those developing or testing a host driver. Most people should say N here. + +config MMC_SIMULATE_MAX_SPEED + bool "Turn on maximum speed control per block device" + depends on MMC_BLOCK + help + Say Y here to enable MMC device speed limiting. Used to test and + simulate the behavior of the system when confronted with a slow MMC. + + Enables max_read_speed, max_write_speed and cache_size attributes to + control the write or read maximum KB/second speed behaviors. + + If unsure, say N here. diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index f2ce13ab7ae6..c15d3f380524 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -288,6 +288,250 @@ out: return ret; } +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + +static int max_read_speed, max_write_speed, cache_size = 4; + +module_param(max_read_speed, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(max_read_speed, "maximum KB/s read speed 0=off"); +module_param(max_write_speed, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(max_write_speed, "maximum KB/s write speed 0=off"); +module_param(cache_size, int, S_IRUSR | S_IRGRP); +MODULE_PARM_DESC(cache_size, "MB high speed memory or SLC cache"); + +/* + * helper macros and expectations: + * size - unsigned long number of bytes + * jiffies - unsigned long HZ timestamp difference + * speed - unsigned KB/s transfer rate + */ +#define size_and_speed_to_jiffies(size, speed) \ + ((size) * HZ / (speed) / 1024UL) +#define jiffies_and_speed_to_size(jiffies, speed) \ + (((speed) * (jiffies) * 1024UL) / HZ) +#define jiffies_and_size_to_speed(jiffies, size) \ + ((size) * HZ / (jiffies) / 1024UL) + +/* Limits to report warning */ +/* jiffies_and_size_to_speed(10*HZ, queue_max_hw_sectors(q) * 512UL) ~ 25 */ +#define MIN_SPEED(q) 250 /* 10 times faster than a floppy disk */ +#define MAX_SPEED(q) jiffies_and_size_to_speed(1, queue_max_sectors(q) * 512UL) + +#define speed_valid(speed) ((speed) > 0) + +static const char off[] = "off\n"; + +static int max_speed_show(int speed, char *buf) +{ + if (speed) + return scnprintf(buf, PAGE_SIZE, "%uKB/s\n", speed); + else + return scnprintf(buf, PAGE_SIZE, off); +} + +static int max_speed_store(const char *buf, struct request_queue *q) +{ + unsigned int limit, set = 0; + + if (!strncasecmp(off, buf, sizeof(off) - 2)) + return set; + if (kstrtouint(buf, 0, &set) || (set > INT_MAX)) + return -EINVAL; + if (set == 0) + return set; + limit = MAX_SPEED(q); + if (set > limit) + pr_warn("max speed %u ineffective above %u\n", set, limit); + limit = MIN_SPEED(q); + if (set < limit) + pr_warn("max speed %u painful below %u\n", set, limit); + return set; +} + +static ssize_t max_write_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int ret = max_speed_show(atomic_read(&md->queue.max_write_speed), buf); + + mmc_blk_put(md); + return ret; +} + +static ssize_t max_write_speed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int set = max_speed_store(buf, md->queue.queue); + + if (set < 0) { + mmc_blk_put(md); + return set; + } + + atomic_set(&md->queue.max_write_speed, set); + mmc_blk_put(md); + return count; +} + +static const 
DEVICE_ATTR(max_write_speed, S_IRUGO | S_IWUSR, + max_write_speed_show, max_write_speed_store); + +static ssize_t max_read_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int ret = max_speed_show(atomic_read(&md->queue.max_read_speed), buf); + + mmc_blk_put(md); + return ret; +} + +static ssize_t max_read_speed_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + int set = max_speed_store(buf, md->queue.queue); + + if (set < 0) { + mmc_blk_put(md); + return set; + } + + atomic_set(&md->queue.max_read_speed, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(max_read_speed, S_IRUGO | S_IWUSR, + max_read_speed_show, max_read_speed_store); + +static ssize_t cache_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + struct mmc_queue *mq = &md->queue; + int cache_size = atomic_read(&mq->cache_size); + int ret; + + if (!cache_size) + ret = scnprintf(buf, PAGE_SIZE, off); + else { + int speed = atomic_read(&mq->max_write_speed); + + if (!speed_valid(speed)) + ret = scnprintf(buf, PAGE_SIZE, "%uMB\n", cache_size); + else { /* We accept race between cache_jiffies and cache_used */ + unsigned long size = jiffies_and_speed_to_size( + jiffies - mq->cache_jiffies, speed); + long used = atomic_long_read(&mq->cache_used); + + if (size >= used) + size = 0; + else + size = (used - size) * 100 / cache_size + / 1024UL / 1024UL; + + ret = scnprintf(buf, PAGE_SIZE, "%uMB %lu%% used\n", + cache_size, size); + } + } + + mmc_blk_put(md); + return ret; +} + +static ssize_t cache_size_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_blk_data *md; + unsigned int set = 0; + + if (strncasecmp(off, buf, sizeof(off) - 2) + && (kstrtouint(buf, 0, &set) || (set > INT_MAX))) + return -EINVAL; + + md = mmc_blk_get(dev_to_disk(dev)); + atomic_set(&md->queue.cache_size, set); + mmc_blk_put(md); + return count; +} + +static const DEVICE_ATTR(cache_size, S_IRUGO | S_IWUSR, + cache_size_show, cache_size_store); + +/* correct for write-back */ +static long mmc_blk_cache_used(struct mmc_queue *mq, unsigned long waitfor) +{ + long used = 0; + int speed = atomic_read(&mq->max_write_speed); + + if (speed_valid(speed)) { + unsigned long size = jiffies_and_speed_to_size( + waitfor - mq->cache_jiffies, speed); + used = atomic_long_read(&mq->cache_used); + + if (size >= used) + used = 0; + else + used -= size; + } + + atomic_long_set(&mq->cache_used, used); + mq->cache_jiffies = waitfor; + + return used; +} + +static void mmc_blk_simulate_delay( + struct mmc_queue *mq, + struct request *req, + unsigned long waitfor) +{ + int max_speed; + + if (!req) + return; + + max_speed = (rq_data_dir(req) == READ) + ? 
atomic_read(&mq->max_read_speed) + : atomic_read(&mq->max_write_speed); + if (speed_valid(max_speed)) { + unsigned long bytes = blk_rq_bytes(req); + + if (rq_data_dir(req) != READ) { + int cache_size = atomic_read(&mq->cache_size); + + if (cache_size) { + unsigned long size = cache_size * 1024L * 1024L; + long used = mmc_blk_cache_used(mq, waitfor); + + used += bytes; + atomic_long_set(&mq->cache_used, used); + bytes = 0; + if (used > size) + bytes = used - size; + } + } + waitfor += size_and_speed_to_jiffies(bytes, max_speed); + if (time_is_after_jiffies(waitfor)) { + long msecs = jiffies_to_msecs(waitfor - jiffies); + + if (likely(msecs > 0)) + msleep(msecs); + } + } +} + +#else + +#define mmc_blk_simulate_delay(mq, req, waitfor) + +#endif + static int mmc_blk_open(struct block_device *bdev, fmode_t mode) { struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); @@ -1264,6 +1508,23 @@ static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req) if (ret) ret = -EIO; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + else if (atomic_read(&mq->cache_size)) { + long used = mmc_blk_cache_used(mq, jiffies); + + if (used) { + int speed = atomic_read(&mq->max_write_speed); + + if (speed_valid(speed)) { + unsigned long msecs = jiffies_to_msecs( + size_and_speed_to_jiffies( + used, speed)); + if (msecs) + msleep(msecs); + } + } + } +#endif blk_end_request_all(req, ret); return ret ? 0 : 1; @@ -1943,6 +2204,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) struct mmc_async_req *areq; const u8 packed_nr = 2; u8 reqs = 0; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + unsigned long waitfor = jiffies; +#endif if (!rqc && !mq->mqrq_prev->req) return 0; @@ -1993,6 +2257,8 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) */ mmc_blk_reset_success(md, type); + mmc_blk_simulate_delay(mq, rqc, waitfor); + if (mmc_packed_cmd(mq_rq->cmd_type)) { ret = mmc_blk_end_packed_req(mq_rq); break; @@ -2411,6 +2677,14 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md) card->ext_csd.boot_ro_lockable) device_remove_file(disk_to_dev(md->disk), &md->power_ro_lock); +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + device_remove_file(disk_to_dev(md->disk), + &dev_attr_max_write_speed); + device_remove_file(disk_to_dev(md->disk), + &dev_attr_max_read_speed); + device_remove_file(disk_to_dev(md->disk), + &dev_attr_cache_size); +#endif del_gendisk(md->disk); } @@ -2446,6 +2720,24 @@ static int mmc_add_disk(struct mmc_blk_data *md) ret = device_create_file(disk_to_dev(md->disk), &md->force_ro); if (ret) goto force_ro_fail; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + atomic_set(&md->queue.max_write_speed, max_write_speed); + ret = device_create_file(disk_to_dev(md->disk), + &dev_attr_max_write_speed); + if (ret) + goto max_write_speed_fail; + atomic_set(&md->queue.max_read_speed, max_read_speed); + ret = device_create_file(disk_to_dev(md->disk), + &dev_attr_max_read_speed); + if (ret) + goto max_read_speed_fail; + atomic_set(&md->queue.cache_size, cache_size); + atomic_long_set(&md->queue.cache_used, 0); + md->queue.cache_jiffies = jiffies; + ret = device_create_file(disk_to_dev(md->disk), &dev_attr_cache_size); + if (ret) + goto cache_size_fail; +#endif if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && card->ext_csd.boot_ro_lockable) { @@ -2470,6 +2762,14 @@ static int mmc_add_disk(struct mmc_blk_data *md) return ret; power_ro_lock_fail: +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + device_remove_file(disk_to_dev(md->disk), &dev_attr_cache_size); +cache_size_fail: + 
device_remove_file(disk_to_dev(md->disk), &dev_attr_max_read_speed); +max_read_speed_fail: + device_remove_file(disk_to_dev(md->disk), &dev_attr_max_write_speed); +max_write_speed_fail: +#endif device_remove_file(disk_to_dev(md->disk), &md->force_ro); force_ro_fail: del_gendisk(md->disk); diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h index 36cddab57d77..d890d8832e21 100644 --- a/drivers/mmc/card/queue.h +++ b/drivers/mmc/card/queue.h @@ -58,6 +58,14 @@ struct mmc_queue { struct mmc_queue_req mqrq[2]; struct mmc_queue_req *mqrq_cur; struct mmc_queue_req *mqrq_prev; +#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED + atomic_t max_write_speed; + atomic_t max_read_speed; + atomic_t cache_size; + /* i/o tracking */ + atomic_long_t cache_used; + unsigned long cache_jiffies; +#endif }; extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, From a354cef9805bef94bb6fb6bd7887447846fee737 Mon Sep 17 00:00:00 2001 From: Rom Lemarchand Date: Thu, 7 Apr 2016 07:19:34 -0700 Subject: [PATCH 0016/3220] android: base-cfg: enable CONFIG_QUOTA Bug: 28032718 Change-Id: I7cb6b641f72085e69b90dca11d2ea68adcd02390 (cherry picked from commit e1b53a388e9cfcf870520a6899a37456cf1ae2c6) --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index fa53af0c37ad..3f82e9339dad 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -138,6 +138,7 @@ CONFIG_PPP_BSDCOMP=y CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y +CONFIG_QUOTA=y CONFIG_RESOURCE_COUNTERS=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y From 00526cde6b4a4154e4ea528f8a0cea1017abbc1c Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Tue, 5 Apr 2016 13:06:27 -0700 Subject: [PATCH 0017/3220] BACKPORT: selinux: restrict kernel module loading Backport notes: Backport uses kernel_module_from_file not kernel_read_file hook. kernel_read_file replaced kernel_module_from_file in the 4.6 kernel. There are no inode_security_() helper functions (also introduced in 4.6) so the inode lookup is done using the file_inode() helper which is standard for kernel version < 4.6. (Cherry picked from commit 61d612ea731e57dc510472fb746b55cdc017f371) Utilize existing kernel_read_file hook on kernel module load. Add module_load permission to the system class. Enforces restrictions on kernel module origin when calling the finit_module syscall. The hook checks that source type has permission module_load for the target type. Example for finit_module: allow foo bar_file:system module_load; Similarly restrictions are enforced on kernel module loading when calling the init_module syscall. The hook checks that source type has permission module_load with itself as the target object because the kernel module is sourced from the calling process. 
Example for init_module: allow foo foo:system module_load; Bug: 27824855 Change-Id: I64bf3bd1ab2dc735321160642dc6bbfa996f8068 Signed-off-by: Jeff Vander Stoep Signed-off-by: Paul Moore --- security/selinux/hooks.c | 33 +++++++++++++++++++++++++++++ security/selinux/include/classmap.h | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7c22a15c7e4b..78d06ff4eeb7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3660,6 +3660,38 @@ static int selinux_kernel_module_request(char *kmod_name) SYSTEM__MODULE_REQUEST, &ad); } +static int selinux_kernel_module_from_file(struct file *file) +{ + struct common_audit_data ad; + struct inode_security_struct *isec; + struct file_security_struct *fsec; + struct inode *inode; + u32 sid = current_sid(); + int rc; + + /* init_module */ + if (file == NULL) + return avc_has_perm(sid, sid, SECCLASS_SYSTEM, + SYSTEM__MODULE_LOAD, NULL); + + /* finit_module */ + ad.type = LSM_AUDIT_DATA_PATH; + ad.u.path = file->f_path; + + inode = file_inode(file); + isec = inode->i_security; + fsec = file->f_security; + + if (sid != fsec->sid) { + rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); + if (rc) + return rc; + } + + return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM, + SYSTEM__MODULE_LOAD, &ad); +} + static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return current_has_perm(p, PROCESS__SETPGID); @@ -5950,6 +5982,7 @@ static struct security_hook_list selinux_hooks[] = { LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as), LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as), LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request), + LSM_HOOK_INIT(kernel_module_from_file, selinux_kernel_module_from_file), LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), LSM_HOOK_INIT(task_getsid, selinux_task_getsid), diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 5a4eef59aeff..b393d29ae857 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -32,7 +32,7 @@ struct security_class_mapping secclass_map[] = { "setsockcreate", NULL } }, { "system", { "ipc_info", "syslog_read", "syslog_mod", - "syslog_console", "module_request", NULL } }, + "syslog_console", "module_request", "module_load", NULL } }, { "capability", { "chown", "dac_override", "dac_read_search", "fowner", "fsetid", "kill", "setgid", "setuid", "setpcap", From 9f6bbb427fc67e5caceec70741def34234078f97 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 12 Apr 2016 01:19:24 +0530 Subject: [PATCH 0018/3220] ANDROID: base-cfg: enable CONFIG_IP_NF_NAT IP_NF_TARGET_{MASQUERADE,NETMAP,REDIRECT} configs, already enabled in android-base.cfg for tethering, are of no use if CONFIG_IP_NF_NAT is not enabled. Don't rely on platform config for that and enable CONFIG_IP_NF_NAT in android-base.cfg as well. 
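For context, the kind of tethering rule these targets exist for looks like the following (interface name and subnet are hypothetical); without CONFIG_IP_NF_NAT there is no iptables nat table for such a rule to live in:

  iptables -t nat -A POSTROUTING -s 192.168.42.0/24 -o rmnet_data0 -j MASQUERADE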
Change-Id: Ic72bcebbd925b142b09539466bf963188c83108a Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 3f82e9339dad..a8b529cae6eb 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -57,6 +57,7 @@ CONFIG_IP_NF_MANGLE=y CONFIG_IP_NF_MATCH_AH=y CONFIG_IP_NF_MATCH_ECN=y CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_NAT=y CONFIG_IP_NF_RAW=y CONFIG_IP_NF_SECURITY=y CONFIG_IP_NF_TARGET_MASQUERADE=y From ea20f758a0f688b713920e1b6c455ab3f0983c9e Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:37:31 -0700 Subject: [PATCH 0019/3220] Revert "drivers: switch: remove S_IWUSR from dev_attr" This reverts commit dc66dee02dcd6ea774e3ed4ae32e88b0f3b4bee7. --- drivers/switch/switch_class.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/switch/switch_class.c b/drivers/switch/switch_class.c index e373b625806e..e05fc2591147 100644 --- a/drivers/switch/switch_class.c +++ b/drivers/switch/switch_class.c @@ -54,8 +54,8 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%s\n", sdev->name); } -static DEVICE_ATTR(state, S_IRUGO, state_show, NULL); -static DEVICE_ATTR(name, S_IRUGO, name_show, NULL); +static DEVICE_ATTR(state, S_IRUGO | S_IWUSR, state_show, NULL); +static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, name_show, NULL); void switch_set_state(struct switch_dev *sdev, int state) { From 00c13787f8275d7ff3e232a0e06a3c26478811e0 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:37:47 -0700 Subject: [PATCH 0020/3220] Revert "switch: switch class and GPIO drivers." Drivers should use extcon moving forward. Documentation/extcon/porting-android-switch-class describes how to port existing switch class drivers to extcon. This reverts commit e4b8e66e0ae2e78e913d7b86f2507fdb0aa731b4. Change-Id: I5b622c7ab4c0cb9670f8903f259a99888f503c1a --- drivers/Kconfig | 2 - drivers/Makefile | 1 - drivers/switch/Kconfig | 15 --- drivers/switch/Makefile | 4 - drivers/switch/switch_class.c | 174 ---------------------------------- drivers/switch/switch_gpio.c | 172 --------------------------------- include/linux/switch.h | 53 ----------- 7 files changed, 421 deletions(-) delete mode 100644 drivers/switch/Kconfig delete mode 100644 drivers/switch/Makefile delete mode 100644 drivers/switch/switch_class.c delete mode 100644 drivers/switch/switch_gpio.c delete mode 100644 include/linux/switch.h diff --git a/drivers/Kconfig b/drivers/Kconfig index 3c363e559864..d2ac339de85f 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -104,8 +104,6 @@ source "drivers/memstick/Kconfig" source "drivers/leds/Kconfig" -source "drivers/switch/Kconfig" - source "drivers/accessibility/Kconfig" source "drivers/infiniband/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 8acfb530993d..795d0ca714bf 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -122,7 +122,6 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-y += mmc/ obj-$(CONFIG_MEMSTICK) += memstick/ obj-y += leds/ -obj-$(CONFIG_SWITCH) += switch/ obj-$(CONFIG_INFINIBAND) += infiniband/ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ diff --git a/drivers/switch/Kconfig b/drivers/switch/Kconfig deleted file mode 100644 index 19404b6f7778..000000000000 --- a/drivers/switch/Kconfig +++ /dev/null @@ -1,15 +0,0 @@ -menuconfig SWITCH - tristate "Switch class support" - help - Say Y here to enable switch class support. 
This allows - monitoring switches by userspace via sysfs and uevent. - -if SWITCH - -config SWITCH_GPIO - tristate "GPIO Swith support" - depends on GPIOLIB - help - Say Y here to enable GPIO based switch support. - -endif # SWITCH diff --git a/drivers/switch/Makefile b/drivers/switch/Makefile deleted file mode 100644 index f7606ed4a719..000000000000 --- a/drivers/switch/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -# Switch Class Driver -obj-$(CONFIG_SWITCH) += switch_class.o -obj-$(CONFIG_SWITCH_GPIO) += switch_gpio.o - diff --git a/drivers/switch/switch_class.c b/drivers/switch/switch_class.c deleted file mode 100644 index e05fc2591147..000000000000 --- a/drivers/switch/switch_class.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * drivers/switch/switch_class.c - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#include -#include -#include -#include -#include -#include -#include - -struct class *switch_class; -static atomic_t device_count; - -static ssize_t state_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct switch_dev *sdev = (struct switch_dev *) - dev_get_drvdata(dev); - - if (sdev->print_state) { - int ret = sdev->print_state(sdev, buf); - if (ret >= 0) - return ret; - } - return sprintf(buf, "%d\n", sdev->state); -} - -static ssize_t name_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct switch_dev *sdev = (struct switch_dev *) - dev_get_drvdata(dev); - - if (sdev->print_name) { - int ret = sdev->print_name(sdev, buf); - if (ret >= 0) - return ret; - } - return sprintf(buf, "%s\n", sdev->name); -} - -static DEVICE_ATTR(state, S_IRUGO | S_IWUSR, state_show, NULL); -static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, name_show, NULL); - -void switch_set_state(struct switch_dev *sdev, int state) -{ - char name_buf[120]; - char state_buf[120]; - char *prop_buf; - char *envp[3]; - int env_offset = 0; - int length; - - if (sdev->state != state) { - sdev->state = state; - - prop_buf = (char *)get_zeroed_page(GFP_KERNEL); - if (prop_buf) { - length = name_show(sdev->dev, NULL, prop_buf); - if (length > 0) { - if (prop_buf[length - 1] == '\n') - prop_buf[length - 1] = 0; - snprintf(name_buf, sizeof(name_buf), - "SWITCH_NAME=%s", prop_buf); - envp[env_offset++] = name_buf; - } - length = state_show(sdev->dev, NULL, prop_buf); - if (length > 0) { - if (prop_buf[length - 1] == '\n') - prop_buf[length - 1] = 0; - snprintf(state_buf, sizeof(state_buf), - "SWITCH_STATE=%s", prop_buf); - envp[env_offset++] = state_buf; - } - envp[env_offset] = NULL; - kobject_uevent_env(&sdev->dev->kobj, KOBJ_CHANGE, envp); - free_page((unsigned long)prop_buf); - } else { - printk(KERN_ERR "out of memory in switch_set_state\n"); - kobject_uevent(&sdev->dev->kobj, KOBJ_CHANGE); - } - } -} -EXPORT_SYMBOL_GPL(switch_set_state); - -static int create_switch_class(void) -{ - if (!switch_class) { - switch_class = class_create(THIS_MODULE, "switch"); - if (IS_ERR(switch_class)) - return PTR_ERR(switch_class); - atomic_set(&device_count, 0); - } - - return 0; -} - -int switch_dev_register(struct 
switch_dev *sdev) -{ - int ret; - - if (!switch_class) { - ret = create_switch_class(); - if (ret < 0) - return ret; - } - - sdev->index = atomic_inc_return(&device_count); - sdev->dev = device_create(switch_class, NULL, - MKDEV(0, sdev->index), NULL, sdev->name); - if (IS_ERR(sdev->dev)) - return PTR_ERR(sdev->dev); - - ret = device_create_file(sdev->dev, &dev_attr_state); - if (ret < 0) - goto err_create_file_1; - ret = device_create_file(sdev->dev, &dev_attr_name); - if (ret < 0) - goto err_create_file_2; - - dev_set_drvdata(sdev->dev, sdev); - sdev->state = 0; - return 0; - -err_create_file_2: - device_remove_file(sdev->dev, &dev_attr_state); -err_create_file_1: - device_destroy(switch_class, MKDEV(0, sdev->index)); - printk(KERN_ERR "switch: Failed to register driver %s\n", sdev->name); - - return ret; -} -EXPORT_SYMBOL_GPL(switch_dev_register); - -void switch_dev_unregister(struct switch_dev *sdev) -{ - device_remove_file(sdev->dev, &dev_attr_name); - device_remove_file(sdev->dev, &dev_attr_state); - device_destroy(switch_class, MKDEV(0, sdev->index)); - dev_set_drvdata(sdev->dev, NULL); -} -EXPORT_SYMBOL_GPL(switch_dev_unregister); - -static int __init switch_class_init(void) -{ - return create_switch_class(); -} - -static void __exit switch_class_exit(void) -{ - class_destroy(switch_class); -} - -module_init(switch_class_init); -module_exit(switch_class_exit); - -MODULE_AUTHOR("Mike Lockwood "); -MODULE_DESCRIPTION("Switch class driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/switch/switch_gpio.c b/drivers/switch/switch_gpio.c deleted file mode 100644 index 621d62d20c99..000000000000 --- a/drivers/switch/switch_gpio.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * drivers/switch/switch_gpio.c - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct gpio_switch_data { - struct switch_dev sdev; - unsigned gpio; - const char *name_on; - const char *name_off; - const char *state_on; - const char *state_off; - int irq; - struct work_struct work; -}; - -static void gpio_switch_work(struct work_struct *work) -{ - int state; - struct gpio_switch_data *data = - container_of(work, struct gpio_switch_data, work); - - state = gpio_get_value(data->gpio); - switch_set_state(&data->sdev, state); -} - -static irqreturn_t gpio_irq_handler(int irq, void *dev_id) -{ - struct gpio_switch_data *switch_data = - (struct gpio_switch_data *)dev_id; - - schedule_work(&switch_data->work); - return IRQ_HANDLED; -} - -static ssize_t switch_gpio_print_state(struct switch_dev *sdev, char *buf) -{ - struct gpio_switch_data *switch_data = - container_of(sdev, struct gpio_switch_data, sdev); - const char *state; - if (switch_get_state(sdev)) - state = switch_data->state_on; - else - state = switch_data->state_off; - - if (state) - return sprintf(buf, "%s\n", state); - return -1; -} - -static int gpio_switch_probe(struct platform_device *pdev) -{ - struct gpio_switch_platform_data *pdata = pdev->dev.platform_data; - struct gpio_switch_data *switch_data; - int ret = 0; - - if (!pdata) - return -EBUSY; - - switch_data = kzalloc(sizeof(struct gpio_switch_data), GFP_KERNEL); - if (!switch_data) - return -ENOMEM; - - switch_data->sdev.name = pdata->name; - switch_data->gpio = pdata->gpio; - switch_data->name_on = pdata->name_on; - switch_data->name_off = pdata->name_off; - switch_data->state_on = pdata->state_on; - switch_data->state_off = pdata->state_off; - switch_data->sdev.print_state = switch_gpio_print_state; - - ret = switch_dev_register(&switch_data->sdev); - if (ret < 0) - goto err_switch_dev_register; - - ret = gpio_request(switch_data->gpio, pdev->name); - if (ret < 0) - goto err_request_gpio; - - ret = gpio_direction_input(switch_data->gpio); - if (ret < 0) - goto err_set_gpio_input; - - INIT_WORK(&switch_data->work, gpio_switch_work); - - switch_data->irq = gpio_to_irq(switch_data->gpio); - if (switch_data->irq < 0) { - ret = switch_data->irq; - goto err_detect_irq_num_failed; - } - - ret = request_irq(switch_data->irq, gpio_irq_handler, - IRQF_TRIGGER_LOW, pdev->name, switch_data); - if (ret < 0) - goto err_request_irq; - - /* Perform initial detection */ - gpio_switch_work(&switch_data->work); - - return 0; - -err_request_irq: -err_detect_irq_num_failed: -err_set_gpio_input: - gpio_free(switch_data->gpio); -err_request_gpio: - switch_dev_unregister(&switch_data->sdev); -err_switch_dev_register: - kfree(switch_data); - - return ret; -} - -static int gpio_switch_remove(struct platform_device *pdev) -{ - struct gpio_switch_data *switch_data = platform_get_drvdata(pdev); - - cancel_work_sync(&switch_data->work); - gpio_free(switch_data->gpio); - switch_dev_unregister(&switch_data->sdev); - kfree(switch_data); - - return 0; -} - -static struct platform_driver gpio_switch_driver = { - .probe = gpio_switch_probe, - .remove = gpio_switch_remove, - .driver = { - .name = "switch-gpio", - .owner = THIS_MODULE, - }, -}; - -static int __init gpio_switch_init(void) -{ - return platform_driver_register(&gpio_switch_driver); -} - -static void __exit gpio_switch_exit(void) -{ - platform_driver_unregister(&gpio_switch_driver); -} - -module_init(gpio_switch_init); -module_exit(gpio_switch_exit); - -MODULE_AUTHOR("Mike Lockwood "); -MODULE_DESCRIPTION("GPIO Switch 
driver"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/switch.h b/include/linux/switch.h deleted file mode 100644 index 3e4c748e343a..000000000000 --- a/include/linux/switch.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Switch class driver - * - * Copyright (C) 2008 Google, Inc. - * Author: Mike Lockwood - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * -*/ - -#ifndef __LINUX_SWITCH_H__ -#define __LINUX_SWITCH_H__ - -struct switch_dev { - const char *name; - struct device *dev; - int index; - int state; - - ssize_t (*print_name)(struct switch_dev *sdev, char *buf); - ssize_t (*print_state)(struct switch_dev *sdev, char *buf); -}; - -struct gpio_switch_platform_data { - const char *name; - unsigned gpio; - - /* if NULL, switch_dev.name will be printed */ - const char *name_on; - const char *name_off; - /* if NULL, "0" or "1" will be printed */ - const char *state_on; - const char *state_off; -}; - -extern int switch_dev_register(struct switch_dev *sdev); -extern void switch_dev_unregister(struct switch_dev *sdev); - -static inline int switch_get_state(struct switch_dev *sdev) -{ - return sdev->state; -} - -extern void switch_set_state(struct switch_dev *sdev, int state); - -#endif /* __LINUX_SWITCH_H__ */ From a9de5129787db0d135eab61359b820c783fb5676 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 19 Apr 2016 12:44:42 -0700 Subject: [PATCH 0021/3220] android: base-cfg: remove CONFIG_SWITCH Change-Id: I3fd1aa7a54fe3a8d3ad5537cbc61386e52f41ea0 Signed-off-by: Dmitry Shmidt --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index a8b529cae6eb..304f1d4fd7c4 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -148,7 +148,6 @@ CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y -CONFIG_SWITCH=y CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y From 24c4a0f75c51c1f42cc0125a024b7c262e664ff7 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:43:29 -0700 Subject: [PATCH 0022/3220] Revert "tcp: Fix IPV6 module build errors" This reverts commit 3823c8136f2170b3ac5e6a5f8b857746a786e845. 
--- net/ipv4/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b799f20387c6..c0e0fd2dd378 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3260,7 +3260,7 @@ static int tcp_is_local(struct net *net, __be32 addr) { return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK); } -#if defined(CONFIG_IPV6) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static int tcp_is_local6(struct net *net, struct in6_addr *addr) { struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0); return rt6 && rt6->dst.dev && (rt6->dst.dev->flags & IFF_LOOPBACK); @@ -3328,7 +3328,7 @@ restart: continue; } -#if defined(CONFIG_IPV6) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (family == AF_INET6) { struct in6_addr *s6; if (!inet->pinet6) @@ -3365,4 +3365,3 @@ restart: return 0; } -EXPORT_SYMBOL_GPL(tcp_nuke_addr); From 215856823c282d0339acdad8c7f154abc30d1027 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:43:58 -0700 Subject: [PATCH 0023/3220] Revert "Don't kill IPv4 sockets when killing IPv6 sockets was requested." This reverts commit 8bf4413b4f54e24120b90ecbfee426beeddc3ff0. --- net/ipv4/tcp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c0e0fd2dd378..9819762c4a92 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3331,8 +3331,6 @@ restart: #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (family == AF_INET6) { struct in6_addr *s6; - if (!inet->pinet6) - continue; s6 = &sk->sk_v6_rcv_saddr; if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED) From 9826b2ec8383b14a82f9d1d545a8573986c16abe Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:44:11 -0700 Subject: [PATCH 0024/3220] Revert "net: fix crash in tcp_nuke_addr()" This reverts commit 08f7c4280cd5efe9e274240c42177f459431bac2. --- net/ipv4/tcp.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9819762c4a92..9b19742cbf42 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3301,19 +3301,8 @@ restart: sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_state == TCP_TIME_WAIT) { - /* - * Sockets that are in TIME_WAIT state are - * instances of lightweight inet_timewait_sock, - * we should simply skip them (or we'll try to - * access non-existing fields and crash). - */ - continue; - } - if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT) continue; - if (sock_flag(sk, SOCK_DEAD)) continue; From 20de1c6bc4d1b4636314fd1a372533b45af809b5 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:44:25 -0700 Subject: [PATCH 0025/3220] Revert "net: fix iterating over hashtable in tcp_nuke_addr()" This reverts commit 4747299b2c8e8778927b3df0501023d76fe4f2d5. 
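Note that this revert reinstates an off-by-one on purpose, ahead of the full removal of tcp_nuke_addr() in the next patch: ehash_mask is (table size - 1), so the last valid bucket index is ehash_mask itself and a complete walk must be inclusive. A sketch of the inclusive loop being reverted away (process_bucket() is a hypothetical stand-in):

    unsigned int bucket;

    /* ehash_mask is a mask, so iterate inclusively */
    for (bucket = 0; bucket <= tcp_hashinfo.ehash_mask; bucket++)
        process_bucket(bucket);
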
--- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9b19742cbf42..854e606f3bce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3291,7 +3291,7 @@ int tcp_nuke_addr(struct net *net, struct sockaddr *addr) return -EAFNOSUPPORT; } - for (bucket = 0; bucket <= tcp_hashinfo.ehash_mask; bucket++) { + for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) { struct hlist_nulls_node *node; struct sock *sk; spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket); From 8591a83caea440173feca9008d2c37898bd7dcba Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Thu, 21 Apr 2016 15:47:01 -0700 Subject: [PATCH 0026/3220] Revert "net: socket ioctl to reset connections matching local address" Use SOCK_DESTROY from now instead of SIOCKILLADDR This reverts commit 38f0ec724f5306c81130ca9343c856aa37a76d54. Change-Id: I2dcd833b66c88a48de8978dce9d72ab78f9af549 --- include/net/tcp.h | 2 - include/uapi/linux/sockios.h | 1 - net/ipv4/af_inet.c | 1 - net/ipv4/devinet.c | 8 +-- net/ipv4/tcp.c | 105 ----------------------------------- net/ipv6/af_inet6.c | 17 ------ 6 files changed, 1 insertion(+), 133 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 984e59d99deb..bcce99a951f8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1681,8 +1681,6 @@ static inline bool tcp_stream_memory_free(const struct sock *sk) return notsent_bytes < tcp_notsent_lowat(tp); } -extern int tcp_nuke_addr(struct net *net, struct sockaddr *addr); - #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index 623e9aab645e..e888b1aed69f 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -65,7 +65,6 @@ #define SIOCDIFADDR 0x8936 /* delete PA address */ #define SIOCSIFHWBROADCAST 0x8937 /* set hardware broadcast addr */ #define SIOCGIFCOUNT 0x8938 /* get number of devices */ -#define SIOCKILLADDR 0x8939 /* kill sockets with this local addr */ #define SIOCGIFBR 0x8940 /* Bridging support */ #define SIOCSIFBR 0x8941 /* Set bridging options */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 671eb0092915..eb12bd0ff9d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -886,7 +886,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCSIFFLAGS: - case SIOCKILLADDR: err = devinet_ioctl(net, cmd, (void __user *)arg); break; default: diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3792624af316..cebd9d31e65a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -59,7 +59,6 @@ #include #include -#include #include #include #include @@ -965,7 +964,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ - case SIOCKILLADDR: /* Nuke all sockets on this address */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; @@ -1017,8 +1015,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) } ret = -EADDRNOTAVAIL; - if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS - && cmd != SIOCKILLADDR) + if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { @@ -1145,9 +1142,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) inet_insert_ifa(ifa); } break; - case 
SIOCKILLADDR: /* Nuke all connections on this address */ - ret = tcp_nuke_addr(net, (struct sockaddr *) sin); - break; } done: rtnl_unlock(); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 854e606f3bce..e0921748b2c0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -276,9 +276,6 @@ #include #include #include -#include -#include -#include #include #include @@ -3250,105 +3247,3 @@ void __init tcp_init(void) BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); } - -static int tcp_is_local(struct net *net, __be32 addr) { - struct rtable *rt; - struct flowi4 fl4 = { .daddr = addr }; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR_OR_NULL(rt)) - return 0; - return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK); -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int tcp_is_local6(struct net *net, struct in6_addr *addr) { - struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0); - return rt6 && rt6->dst.dev && (rt6->dst.dev->flags & IFF_LOOPBACK); -} -#endif - -/* - * tcp_nuke_addr - destroy all sockets on the given local address - * if local address is the unspecified address (0.0.0.0 or ::), destroy all - * sockets with local addresses that are not configured. - */ -int tcp_nuke_addr(struct net *net, struct sockaddr *addr) -{ - int family = addr->sa_family; - unsigned int bucket; - - struct in_addr *in; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct in6_addr *in6 = NULL; -#endif - if (family == AF_INET) { - in = &((struct sockaddr_in *)addr)->sin_addr; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - } else if (family == AF_INET6) { - in6 = &((struct sockaddr_in6 *)addr)->sin6_addr; -#endif - } else { - return -EAFNOSUPPORT; - } - - for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) { - struct hlist_nulls_node *node; - struct sock *sk; - spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket); - -restart: - spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) { - struct inet_sock *inet = inet_sk(sk); - - if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT) - continue; - if (sock_flag(sk, SOCK_DEAD)) - continue; - - if (family == AF_INET) { - __be32 s4 = inet->inet_rcv_saddr; - if (s4 == LOOPBACK4_IPV6) - continue; - - if (in->s_addr != s4 && - !(in->s_addr == INADDR_ANY && - !tcp_is_local(net, s4))) - continue; - } - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - if (family == AF_INET6) { - struct in6_addr *s6; - - s6 = &sk->sk_v6_rcv_saddr; - if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED) - continue; - - if (!ipv6_addr_equal(in6, s6) && - !(ipv6_addr_equal(in6, &in6addr_any) && - !tcp_is_local6(net, s6))) - continue; - } -#endif - - sock_hold(sk); - spin_unlock_bh(lock); - - local_bh_disable(); - bh_lock_sock(sk); - sk->sk_err = ETIMEDOUT; - sk->sk_error_report(sk); - - tcp_done(sk); - bh_unlock_sock(sk); - local_bh_enable(); - sock_put(sk); - - goto restart; - } - spin_unlock_bh(lock); - } - - return 0; -} diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 99fccad391e0..d9b25bd17bf1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -495,21 +495,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet6_getname); -int inet6_killaddr_ioctl(struct net *net, void __user *arg) { - struct in6_ifreq ireq; - struct sockaddr_in6 sin6; - - if (!capable(CAP_NET_ADMIN)) - return -EACCES; - - if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) - return -EFAULT; - - sin6.sin6_family = AF_INET6; - 
sin6.sin6_addr = ireq.ifr6_addr; - return tcp_nuke_addr(net, (struct sockaddr *) &sin6); -} - int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -533,8 +518,6 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return addrconf_del_ifaddr(net, (void __user *) arg); case SIOCSIFDSTADDR: return addrconf_set_dstaddr(net, (void __user *) arg); - case SIOCKILLADDR: - return inet6_killaddr_ioctl(net, (void __user *) arg); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; From a7eabf90aea68842b735193f0df703f659b45715 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 16:58:20 +0530 Subject: [PATCH 0027/3220] Revert "misc: uid_stat: Include linux/atomic.h instead of asm/atomic.h" This reverts commit 8d3a6c1538fb021448c4f6381f6191061f947ba1. This series of patches revert AOSP UID_STAT and NET_ACTIVITY_STATS drivers. I could not find any meaningful usage of these interfaces in AOSP master. UID_STAT driver expose "/proc/uid_stat/*" interfaces but it is only used in AOSP master as in what appears be an out of date bandwidth test in frameworks/base and in somewhat recent battery utils test in external/chromium-trace project. NET_ACTIVITY_STATS driver expose "/proc/net/stat/activity" interface but I can not track its usage anywhere in AOSP at all. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 185c69c9738a..8b8c9a22360b 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -13,7 +13,7 @@ * */ -#include +#include #include #include From cdb6973ae15c5f62947ea7afc29fc1175ecb3172 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:08 +0530 Subject: [PATCH 0028/3220] Revert "misc seq_printf fixes for 4.4" This reverts commit 5c7566a29bff14166d952f2ea525d5231546f821. This patch revert some changes in net/netfilter/xt_qtaguid.c as well. I'll submit another patch to restore those changes. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 3 +-- net/activity_stats.c | 7 ++++--- net/netfilter/xt_qtaguid.c | 5 +++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 8b8c9a22360b..4766c1f83b94 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -55,8 +55,7 @@ static int uid_stat_atomic_int_show(struct seq_file *m, void *v) atomic_t *counter = m->private; bytes = (unsigned int) (atomic_read(counter) + INT_MIN); - seq_printf(m, "%u\n", bytes); - return seq_has_overflowed(m) ? 
-ENOSPC : 0; + return seq_printf(m, "%u\n", bytes); } static int uid_stat_read_atomic_int_open(struct inode *inode, struct file *file) diff --git a/net/activity_stats.c b/net/activity_stats.c index 3bf92d80b8b9..4609ce2043eb 100644 --- a/net/activity_stats.c +++ b/net/activity_stats.c @@ -63,13 +63,14 @@ void activity_stats_update(void) static int activity_stats_show(struct seq_file *m, void *v) { int i; + int ret; seq_printf(m, "Min Bucket(sec) Count\n"); for (i = 0; i < BUCKET_MAX; i++) { - seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); - if (seq_has_overflowed(m)) - return -ENOSPC; + ret = seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); + if (ret) + return ret; } return 0; diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e1442bfb668d..04bb081adde8 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -2543,6 +2543,7 @@ static void pp_stats_header(struct seq_file *m) static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, int cnt_set) { + int ret; struct data_counters *cnts; tag_t tag = ts_entry->tn.tag; uid_t stat_uid = get_uid_from_tag(tag); @@ -2561,7 +2562,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, } ppi->item_index++; cnts = &ts_entry->counters; - seq_printf(m, "%d %s 0x%llx %u %u " + ret = seq_printf(m, "%d %s 0x%llx %u %u " "%llu %llu " "%llu %llu " "%llu %llu " @@ -2591,7 +2592,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); - return seq_has_overflowed(m) ? -ENOSPC : 1; + return ret ?: 1; } static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry) From 1ada37dc073ab6e49fd71d2fdc2be2e4f87c56cd Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:31 +0530 Subject: [PATCH 0029/3220] Revert "misc: uidstat: Remove use of obsolete create_proc_read_entry api" This reverts commit fccab646d33381af63e4f4c0d4f309a1d2b4b0c3. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 50 ++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 4766c1f83b94..509822c81e97 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -49,26 +48,41 @@ static struct uid_stat *find_uid_stat(uid_t uid) { return NULL; } -static int uid_stat_atomic_int_show(struct seq_file *m, void *v) +static int tcp_snd_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { + int len; unsigned int bytes; - atomic_t *counter = m->private; + char *p = page; + struct uid_stat *uid_entry = (struct uid_stat *) data; + if (!data) + return 0; - bytes = (unsigned int) (atomic_read(counter) + INT_MIN); - return seq_printf(m, "%u\n", bytes); + bytes = (unsigned int) (atomic_read(&uid_entry->tcp_snd) + INT_MIN); + p += sprintf(p, "%u\n", bytes); + len = (p - page) - off; + *eof = (len <= count) ? 
1 : 0; + *start = page + off; + return len; } -static int uid_stat_read_atomic_int_open(struct inode *inode, struct file *file) +static int tcp_rcv_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { - return single_open(file, uid_stat_atomic_int_show, PDE_DATA(inode)); -} + int len; + unsigned int bytes; + char *p = page; + struct uid_stat *uid_entry = (struct uid_stat *) data; + if (!data) + return 0; -static const struct file_operations uid_stat_read_atomic_int_fops = { - .open = uid_stat_read_atomic_int_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; + bytes = (unsigned int) (atomic_read(&uid_entry->tcp_rcv) + INT_MIN); + p += sprintf(p, "%u\n", bytes); + len = (p - page) - off; + *eof = (len <= count) ? 1 : 0; + *start = page + off; + return len; +} /* Create a new entry for tracking the specified uid. */ static struct uid_stat *create_stat(uid_t uid) { @@ -95,11 +109,11 @@ static void create_stat_proc(struct uid_stat *new_uid) entry = proc_mkdir(uid_s, parent); /* Keep reference to uid_stat so we know what uid to read stats from. */ - proc_create_data("tcp_snd", S_IRUGO, entry, - &uid_stat_read_atomic_int_fops, &new_uid->tcp_snd); + create_proc_read_entry("tcp_snd", S_IRUGO, entry , tcp_snd_read_proc, + (void *) new_uid); - proc_create_data("tcp_rcv", S_IRUGO, entry, - &uid_stat_read_atomic_int_fops, &new_uid->tcp_rcv); + create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, + (void *) new_uid); } static struct uid_stat *find_or_create_uid_stat(uid_t uid) From cded5596a31db1b1e7fa914b4b47fb488dbd819c Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:43 +0530 Subject: [PATCH 0030/3220] Revert "misc: uidstat: avoid create_stat() race and blockage." This reverts commit f7a812174033fe620509e6e8ca7022abd924b1c4. Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 52 +++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 509822c81e97..2141124a6c12 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -38,13 +38,17 @@ struct uid_stat { }; static struct uid_stat *find_uid_stat(uid_t uid) { + unsigned long flags; struct uid_stat *entry; + spin_lock_irqsave(&uid_lock, flags); list_for_each_entry(entry, &uid_list, link) { if (entry->uid == uid) { + spin_unlock_irqrestore(&uid_lock, flags); return entry; } } + spin_unlock_irqrestore(&uid_lock, flags); return NULL; } @@ -86,10 +90,13 @@ static int tcp_rcv_read_proc(char *page, char **start, off_t off, /* Create a new entry for tracking the specified uid. */ static struct uid_stat *create_stat(uid_t uid) { + unsigned long flags; + char uid_s[32]; struct uid_stat *new_uid; + struct proc_dir_entry *entry; + /* Create the uid stat struct and append it to the list. 
*/ - new_uid = kmalloc(sizeof(struct uid_stat), GFP_ATOMIC); - if (!new_uid) + if ((new_uid = kmalloc(sizeof(struct uid_stat), GFP_KERNEL)) == NULL) return NULL; new_uid->uid = uid; @@ -97,15 +104,11 @@ static struct uid_stat *create_stat(uid_t uid) { atomic_set(&new_uid->tcp_rcv, INT_MIN); atomic_set(&new_uid->tcp_snd, INT_MIN); + spin_lock_irqsave(&uid_lock, flags); list_add_tail(&new_uid->link, &uid_list); - return new_uid; -} + spin_unlock_irqrestore(&uid_lock, flags); -static void create_stat_proc(struct uid_stat *new_uid) -{ - char uid_s[32]; - struct proc_dir_entry *entry; - sprintf(uid_s, "%d", new_uid->uid); + sprintf(uid_s, "%d", uid); entry = proc_mkdir(uid_s, parent); /* Keep reference to uid_stat so we know what uid to read stats from. */ @@ -114,31 +117,17 @@ static void create_stat_proc(struct uid_stat *new_uid) create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, (void *) new_uid); -} -static struct uid_stat *find_or_create_uid_stat(uid_t uid) -{ - struct uid_stat *entry; - unsigned long flags; - spin_lock_irqsave(&uid_lock, flags); - entry = find_uid_stat(uid); - if (entry) { - spin_unlock_irqrestore(&uid_lock, flags); - return entry; - } - entry = create_stat(uid); - spin_unlock_irqrestore(&uid_lock, flags); - if (entry) - create_stat_proc(entry); - return entry; + return new_uid; } int uid_stat_tcp_snd(uid_t uid, int size) { struct uid_stat *entry; activity_stats_update(); - entry = find_or_create_uid_stat(uid); - if (!entry) - return -1; + if ((entry = find_uid_stat(uid)) == NULL && + ((entry = create_stat(uid)) == NULL)) { + return -1; + } atomic_add(size, &entry->tcp_snd); return 0; } @@ -146,9 +135,10 @@ int uid_stat_tcp_snd(uid_t uid, int size) { int uid_stat_tcp_rcv(uid_t uid, int size) { struct uid_stat *entry; activity_stats_update(); - entry = find_or_create_uid_stat(uid); - if (!entry) - return -1; + if ((entry = find_uid_stat(uid)) == NULL && + ((entry = create_stat(uid)) == NULL)) { + return -1; + } atomic_add(size, &entry->tcp_rcv); return 0; } From 5c0d8ae10ab777417eefc39cd5f84d64c6f34622 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:00:57 +0530 Subject: [PATCH 0031/3220] Revert "net: activity_stats: Stop using obsolete create_proc_read_entry api" This reverts commit 7c121720fa14889d59e933aad0a8b9ce948a39ae. 
Signed-off-by: Amit Pundir --- net/activity_stats.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/net/activity_stats.c b/net/activity_stats.c index 4609ce2043eb..8a3e93470069 100644 --- a/net/activity_stats.c +++ b/net/activity_stats.c @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -60,20 +59,29 @@ void activity_stats_update(void) spin_unlock_irqrestore(&activity_lock, flags); } -static int activity_stats_show(struct seq_file *m, void *v) +static int activity_stats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) { int i; - int ret; + int len; + char *p = page; - seq_printf(m, "Min Bucket(sec) Count\n"); + /* Only print if offset is 0, or we have enough buffer space */ + if (off || count < (30 * BUCKET_MAX + 22)) + return -ENOMEM; + + len = snprintf(p, count, "Min Bucket(sec) Count\n"); + count -= len; + p += len; for (i = 0; i < BUCKET_MAX; i++) { - ret = seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]); - if (ret) - return ret; + len = snprintf(p, count, "%15d %lu\n", 1 << i, activity_stats[i]); + count -= len; + p += len; } + *eof = 1; - return 0; + return p - page; } static int activity_stats_notifier(struct notifier_block *nb, @@ -92,26 +100,14 @@ static int activity_stats_notifier(struct notifier_block *nb, return 0; } -static int activity_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, activity_stats_show, PDE_DATA(inode)); -} - -static const struct file_operations activity_stats_fops = { - .open = activity_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static struct notifier_block activity_stats_notifier_block = { .notifier_call = activity_stats_notifier, }; static int __init activity_stats_init(void) { - proc_create("activity", S_IRUGO, - init_net.proc_net_stat, &activity_stats_fops); + create_proc_read_entry("activity", S_IRUGO, + init_net.proc_net_stat, activity_stats_read_proc, NULL); return register_pm_notifier(&activity_stats_notifier_block); } From 42d9422a803afc7c1db3502787fdbe75577bc14d Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:01:08 +0530 Subject: [PATCH 0032/3220] Revert "net: activity_stats: Add statistics for network transmission activity" This reverts commit afedd7beba14385fd797166751fde39e0f52cf72. 
Signed-off-by: Amit Pundir --- drivers/misc/uid_stat.c | 3 - include/net/activity_stats.h | 25 -------- net/Kconfig | 8 --- net/Makefile | 1 - net/activity_stats.c | 115 ----------------------------------- 5 files changed, 152 deletions(-) delete mode 100644 include/net/activity_stats.h delete mode 100644 net/activity_stats.c diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c index 2141124a6c12..e6760b56083c 100644 --- a/drivers/misc/uid_stat.c +++ b/drivers/misc/uid_stat.c @@ -24,7 +24,6 @@ #include #include #include -#include static DEFINE_SPINLOCK(uid_lock); static LIST_HEAD(uid_list); @@ -123,7 +122,6 @@ static struct uid_stat *create_stat(uid_t uid) { int uid_stat_tcp_snd(uid_t uid, int size) { struct uid_stat *entry; - activity_stats_update(); if ((entry = find_uid_stat(uid)) == NULL && ((entry = create_stat(uid)) == NULL)) { return -1; @@ -134,7 +132,6 @@ int uid_stat_tcp_snd(uid_t uid, int size) { int uid_stat_tcp_rcv(uid_t uid, int size) { struct uid_stat *entry; - activity_stats_update(); if ((entry = find_uid_stat(uid)) == NULL && ((entry = create_stat(uid)) == NULL)) { return -1; diff --git a/include/net/activity_stats.h b/include/net/activity_stats.h deleted file mode 100644 index 10e4c1506eeb..000000000000 --- a/include/net/activity_stats.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2010 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * Author: Mike Chan (mike@android.com) - */ - -#ifndef __activity_stats_h -#define __activity_stats_h - -#ifdef CONFIG_NET_ACTIVITY_STATS -void activity_stats_update(void); -#else -#define activity_stats_update(void) {} -#endif - -#endif /* _NET_ACTIVITY_STATS_H */ diff --git a/net/Kconfig b/net/Kconfig index 043fe1dc0860..ce9585cf343a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -92,14 +92,6 @@ config ANDROID_PARANOID_NETWORK help none -config NET_ACTIVITY_STATS - bool "Network activity statistics tracking" - default y - help - Network activity statistics are useful for tracking wireless - modem activity on 2G, 3G, 4G wireless networks. Counts number of - transmissions and groups them in specified time buckets. - config NETWORK_SECMARK bool "Security Marking" help diff --git a/net/Makefile b/net/Makefile index eeb9d5db454f..a5d04098dfce 100644 --- a/net/Makefile +++ b/net/Makefile @@ -77,4 +77,3 @@ endif ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif -obj-$(CONFIG_NET_ACTIVITY_STATS) += activity_stats.o diff --git a/net/activity_stats.c b/net/activity_stats.c deleted file mode 100644 index 8a3e93470069..000000000000 --- a/net/activity_stats.c +++ /dev/null @@ -1,115 +0,0 @@ -/* net/activity_stats.c - * - * Copyright (C) 2010 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * Author: Mike Chan (mike@android.com) - */ - -#include -#include -#include - -/* - * Track transmission rates in buckets (power of 2). - * 1,2,4,8...512 seconds. - * - * Buckets represent the count of network transmissions at least - * N seconds apart, where N is 1 << bucket index. - */ -#define BUCKET_MAX 10 - -/* Track network activity frequency */ -static unsigned long activity_stats[BUCKET_MAX]; -static ktime_t last_transmit; -static ktime_t suspend_time; -static DEFINE_SPINLOCK(activity_lock); - -void activity_stats_update(void) -{ - int i; - unsigned long flags; - ktime_t now; - s64 delta; - - spin_lock_irqsave(&activity_lock, flags); - now = ktime_get(); - delta = ktime_to_ns(ktime_sub(now, last_transmit)); - - for (i = BUCKET_MAX - 1; i >= 0; i--) { - /* - * Check if the time delta between network activity is within the - * minimum bucket range. - */ - if (delta < (1000000000ULL << i)) - continue; - - activity_stats[i]++; - last_transmit = now; - break; - } - spin_unlock_irqrestore(&activity_lock, flags); -} - -static int activity_stats_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int i; - int len; - char *p = page; - - /* Only print if offset is 0, or we have enough buffer space */ - if (off || count < (30 * BUCKET_MAX + 22)) - return -ENOMEM; - - len = snprintf(p, count, "Min Bucket(sec) Count\n"); - count -= len; - p += len; - - for (i = 0; i < BUCKET_MAX; i++) { - len = snprintf(p, count, "%15d %lu\n", 1 << i, activity_stats[i]); - count -= len; - p += len; - } - *eof = 1; - - return p - page; -} - -static int activity_stats_notifier(struct notifier_block *nb, - unsigned long event, void *dummy) -{ - switch (event) { - case PM_SUSPEND_PREPARE: - suspend_time = ktime_get_real(); - break; - - case PM_POST_SUSPEND: - suspend_time = ktime_sub(ktime_get_real(), suspend_time); - last_transmit = ktime_sub(last_transmit, suspend_time); - } - - return 0; -} - -static struct notifier_block activity_stats_notifier_block = { - .notifier_call = activity_stats_notifier, -}; - -static int __init activity_stats_init(void) -{ - create_proc_read_entry("activity", S_IRUGO, - init_net.proc_net_stat, activity_stats_read_proc, NULL); - return register_pm_notifier(&activity_stats_notifier_block); -} - -subsys_initcall(activity_stats_init); - From ece28ad441409646dc6330b06d347465d2730feb Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 17:08:15 +0530 Subject: [PATCH 0033/3220] Revert "misc: uidstat: Adding uid stat driver to collect network statistics." This reverts commit 6b6d5fbf9ae567aefb58099a30bbb6d25fa8925b. Signed-off-by: Amit Pundir --- drivers/misc/Kconfig | 4 - drivers/misc/Makefile | 1 - drivers/misc/uid_stat.c | 153 --------------------------------------- include/linux/uid_stat.h | 29 -------- net/ipv4/tcp.c | 14 ---- 5 files changed, 201 deletions(-) delete mode 100644 drivers/misc/uid_stat.c delete mode 100644 include/linux/uid_stat.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 98c020b560ac..2763d6b3b063 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -412,10 +412,6 @@ config TI_DAC7512 This driver can also be built as a module. If so, the module will be called ti_dac7512. 
-config UID_STAT - bool "UID based statistics tracking exported to /proc/uid_stat" - default n - config VMWARE_BALLOON tristate "VMware Balloon Driver" depends on VMWARE_VMCI && X86 && HYPERVISOR_GUEST diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 24483a6caa6b..e5142b836aee 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -35,7 +35,6 @@ obj-$(CONFIG_ISL29020) += isl29020.o obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o obj-$(CONFIG_DS1682) += ds1682.o obj-$(CONFIG_TI_DAC7512) += ti_dac7512.o -obj-$(CONFIG_UID_STAT) += uid_stat.o obj-$(CONFIG_C2PORT) += c2port/ obj-$(CONFIG_HMC6352) += hmc6352.o obj-y += eeprom/ diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c deleted file mode 100644 index e6760b56083c..000000000000 --- a/drivers/misc/uid_stat.c +++ /dev/null @@ -1,153 +0,0 @@ -/* drivers/misc/uid_stat.c - * - * Copyright (C) 2008 - 2009 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_SPINLOCK(uid_lock); -static LIST_HEAD(uid_list); -static struct proc_dir_entry *parent; - -struct uid_stat { - struct list_head link; - uid_t uid; - atomic_t tcp_rcv; - atomic_t tcp_snd; -}; - -static struct uid_stat *find_uid_stat(uid_t uid) { - unsigned long flags; - struct uid_stat *entry; - - spin_lock_irqsave(&uid_lock, flags); - list_for_each_entry(entry, &uid_list, link) { - if (entry->uid == uid) { - spin_unlock_irqrestore(&uid_lock, flags); - return entry; - } - } - spin_unlock_irqrestore(&uid_lock, flags); - return NULL; -} - -static int tcp_snd_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - unsigned int bytes; - char *p = page; - struct uid_stat *uid_entry = (struct uid_stat *) data; - if (!data) - return 0; - - bytes = (unsigned int) (atomic_read(&uid_entry->tcp_snd) + INT_MIN); - p += sprintf(p, "%u\n", bytes); - len = (p - page) - off; - *eof = (len <= count) ? 1 : 0; - *start = page + off; - return len; -} - -static int tcp_rcv_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - unsigned int bytes; - char *p = page; - struct uid_stat *uid_entry = (struct uid_stat *) data; - if (!data) - return 0; - - bytes = (unsigned int) (atomic_read(&uid_entry->tcp_rcv) + INT_MIN); - p += sprintf(p, "%u\n", bytes); - len = (p - page) - off; - *eof = (len <= count) ? 1 : 0; - *start = page + off; - return len; -} - -/* Create a new entry for tracking the specified uid. */ -static struct uid_stat *create_stat(uid_t uid) { - unsigned long flags; - char uid_s[32]; - struct uid_stat *new_uid; - struct proc_dir_entry *entry; - - /* Create the uid stat struct and append it to the list. */ - if ((new_uid = kmalloc(sizeof(struct uid_stat), GFP_KERNEL)) == NULL) - return NULL; - - new_uid->uid = uid; - /* Counters start at INT_MIN, so we can track 4GB of network traffic. 
*/ - atomic_set(&new_uid->tcp_rcv, INT_MIN); - atomic_set(&new_uid->tcp_snd, INT_MIN); - - spin_lock_irqsave(&uid_lock, flags); - list_add_tail(&new_uid->link, &uid_list); - spin_unlock_irqrestore(&uid_lock, flags); - - sprintf(uid_s, "%d", uid); - entry = proc_mkdir(uid_s, parent); - - /* Keep reference to uid_stat so we know what uid to read stats from. */ - create_proc_read_entry("tcp_snd", S_IRUGO, entry , tcp_snd_read_proc, - (void *) new_uid); - - create_proc_read_entry("tcp_rcv", S_IRUGO, entry, tcp_rcv_read_proc, - (void *) new_uid); - - return new_uid; -} - -int uid_stat_tcp_snd(uid_t uid, int size) { - struct uid_stat *entry; - if ((entry = find_uid_stat(uid)) == NULL && - ((entry = create_stat(uid)) == NULL)) { - return -1; - } - atomic_add(size, &entry->tcp_snd); - return 0; -} - -int uid_stat_tcp_rcv(uid_t uid, int size) { - struct uid_stat *entry; - if ((entry = find_uid_stat(uid)) == NULL && - ((entry = create_stat(uid)) == NULL)) { - return -1; - } - atomic_add(size, &entry->tcp_rcv); - return 0; -} - -static int __init uid_stat_init(void) -{ - parent = proc_mkdir("uid_stat", NULL); - if (!parent) { - pr_err("uid_stat: failed to create proc entry\n"); - return -1; - } - return 0; -} - -__initcall(uid_stat_init); diff --git a/include/linux/uid_stat.h b/include/linux/uid_stat.h deleted file mode 100644 index 6bd6c4e52d17..000000000000 --- a/include/linux/uid_stat.h +++ /dev/null @@ -1,29 +0,0 @@ -/* include/linux/uid_stat.h - * - * Copyright (C) 2008-2009 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef __uid_stat_h -#define __uid_stat_h - -/* Contains definitions for resource tracking per uid. 
*/ - -#ifdef CONFIG_UID_STAT -int uid_stat_tcp_snd(uid_t uid, int size); -int uid_stat_tcp_rcv(uid_t uid, int size); -#else -#define uid_stat_tcp_snd(uid, size) do {} while (0); -#define uid_stat_tcp_rcv(uid, size) do {} while (0); -#endif - -#endif /* _LINUX_UID_STAT_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e0921748b2c0..a1d5c67e251d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,7 +269,6 @@ #include #include #include -#include #include #include @@ -1284,10 +1283,6 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); out_nopush: release_sock(sk); - - if (copied + copied_syn) - uid_stat_tcp_snd(from_kuid(&init_user_ns, current_uid()), - copied + copied_syn); return copied + copied_syn; do_fault: @@ -1562,8 +1557,6 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - copied); } return copied; } @@ -1897,10 +1890,6 @@ skip_copy: tcp_cleanup_rbuf(sk, copied); release_sock(sk); - - if (copied > 0) - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - copied); return copied; out: @@ -1909,9 +1898,6 @@ out: recv_urg: err = tcp_recv_urg(sk, msg, len, flags); - if (err > 0) - uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()), - err); goto out; recv_sndq: From 7c79aca516a51401a66b28ca68bd370ed7e9db0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 1 Oct 2015 10:44:36 +0530 Subject: [PATCH 0034/3220] netfilter: xt_qtaguid: seq_printf fixes Update seq_printf() usage in xt_qtaguid to align with changes from mainline commit 6798a8caaf64 "fs/seq_file: convert int seq_vprint/seq_printf/etc... returns to void". Signed-off-by: Amit Pundir --- net/netfilter/xt_qtaguid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 04bb081adde8..e1442bfb668d 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -2543,7 +2543,6 @@ static void pp_stats_header(struct seq_file *m) static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, int cnt_set) { - int ret; struct data_counters *cnts; tag_t tag = ts_entry->tn.tag; uid_t stat_uid = get_uid_from_tag(tag); @@ -2562,7 +2561,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, } ppi->item_index++; cnts = &ts_entry->counters; - ret = seq_printf(m, "%d %s 0x%llx %u %u " + seq_printf(m, "%d %s 0x%llx %u %u " "%llu %llu " "%llu %llu " "%llu %llu " @@ -2592,7 +2591,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); - return ret ?: 1; + return seq_has_overflowed(m) ? -ENOSPC : 1; } static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry) From 5e00c3098bbfbf22086e7ce350cc63ea913a1c15 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 25 Apr 2016 23:55:44 +0530 Subject: [PATCH 0035/3220] android: recommended.cfg: remove CONFIG_UID_STAT Remove UID Stat driver. 
Change-Id: Ifc9d2c6fe27900f30e6407398f5b24222518bffc Signed-off-by: Amit Pundir --- android/configs/android-recommended.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index e4c8aaade197..35936afdcae4 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -119,7 +119,6 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y -CONFIG_UID_STAT=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y From 4b7de8375299c9f0e330ce521a956bc35bf1d12c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 22 Apr 2016 00:00:14 -0700 Subject: [PATCH 0036/3220] vfs: change d_canonical_path to take two paths bug: 23904372 Change-Id: I4a686d64b6de37decf60019be1718e1d820193e6 Signed-off-by: Daniel Rosenberg --- fs/notify/inotify/inotify_user.c | 2 +- fs/sdcardfs/dentry.c | 6 +++++- include/linux/dcache.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f72f3b25b3f2..e2893f17dde2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -746,7 +746,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, /* support stacked filesystems */ if(path.dentry && path.dentry->d_op) { if (path.dentry->d_op->d_canonical_path) { - path.dentry->d_op->d_canonical_path(path.dentry, &alteredpath); + path.dentry->d_op->d_canonical_path(&path, &alteredpath); canonical_path = &alteredpath; path_put(&path); } diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index ba165ef11e27..971928ab6c21 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -172,11 +172,15 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, return 1; } +static void sdcardfs_canonical_path(const struct path *path, struct path *actual_path) { + sdcardfs_get_real_lower(path->dentry, actual_path); +} + const struct dentry_operations sdcardfs_ci_dops = { .d_revalidate = sdcardfs_d_revalidate, .d_release = sdcardfs_d_release, .d_hash = sdcardfs_hash_ci, .d_compare = sdcardfs_cmp_ci, - .d_canonical_path = sdcardfs_get_real_lower, + .d_canonical_path = sdcardfs_canonical_path, }; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ebad56e31a8c..5819e9b0ee22 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -161,7 +161,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); struct inode *(*d_select_inode)(struct dentry *, unsigned); - void (*d_canonical_path)(const struct dentry *, struct path *); + void (*d_canonical_path)(const struct path *, struct path *); } ____cacheline_aligned; /* From 00ee836d89c5756b4e937eae58c9d0d1d7ffe560 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 22 Apr 2016 00:00:48 -0700 Subject: [PATCH 0037/3220] fuse: Add support for d_canonical_path Allows FUSE to report to inotify that it is acting as a layered filesystem. The userspace component returns a string representing the location of the underlying file. If the string cannot be resolved into a path, the top level path is returned instead. 
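As an illustration only, a hedged sketch of the userspace side of this protocol, assuming a libfuse-style lowlevel daemon; handle_canonical_path() and lower_path_for() are hypothetical names. The single out-argument carries a NUL-terminated path, which the kernel resolves with kern_path(), falling back to the FUSE path itself on error:

    #include <fuse_lowlevel.h>
    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void handle_canonical_path(fuse_req_t req, uint64_t nodeid)
    {
        char path[PATH_MAX];	/* matches req->out.args[0].size */

        /* lower_path_for() is a hypothetical daemon-side lookup */
        snprintf(path, sizeof(path), "%s", lower_path_for(nodeid));
        fuse_reply_buf(req, path, strlen(path) + 1);
    }
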
bug: 23904372 Change-Id: Iabdca0bbedfbff59e9c820c58636a68ef9683d9f Signed-off-by: Daniel Rosenberg --- fs/fuse/dev.c | 5 +++++ fs/fuse/dir.c | 45 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 1 + 4 files changed, 54 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 175fcdeabe4c..8932c06e40c1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1935,6 +1936,10 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, cs->move_pages = 0; err = copy_out_args(cs, &req->out, nbytes); + if (req->in.h.opcode == FUSE_CANONICAL_PATH) { + req->out.h.error = kern_path((char *)req->out.args[0].value, 0, + req->canonical_path); + } fuse_copy_finish(cs); spin_lock(&fpq->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e2e08712d3b..24df0a5df675 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -267,6 +267,50 @@ invalid: goto out; } +/* + * Get the canonical path. Since we must translate to a path, this must be done + * in the context of the userspace daemon, however, the userspace daemon cannot + * look up paths on its own. Instead, we handle the lookup as a special case + * inside of the write request. + */ +static void fuse_dentry_canonical_path(const struct path *path, struct path *canonical_path) { + struct inode *inode = path->dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int err; + char *path_name; + + req = fuse_get_req(fc, 1); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto default_path; + + path_name = (char*)__get_free_page(GFP_KERNEL); + if (!path_name) { + fuse_put_request(fc, req); + goto default_path; + } + + req->in.h.opcode = FUSE_CANONICAL_PATH; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 0; + req->out.numargs = 1; + req->out.args[0].size = PATH_MAX; + req->out.args[0].value = path_name; + req->canonical_path = canonical_path; + req->out.argvar = 1; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + free_page((unsigned long)path_name); + if (!err) + return; +default_path: + canonical_path->dentry = path->dentry; + canonical_path->mnt = path->mnt; + path_get(canonical_path); +} + static int invalid_nodeid(u64 nodeid) { return !nodeid || nodeid == FUSE_ROOT_ID; @@ -274,6 +318,7 @@ static int invalid_nodeid(u64 nodeid) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_canonical_path = fuse_dentry_canonical_path, }; int fuse_valid_type(int m) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 405113101db8..6b4f8dd8e9c4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -363,6 +363,9 @@ struct fuse_req { /** Inode used in the request or NULL */ struct inode *inode; + /** Path used for completing d_canonical_path */ + struct path *canonical_path; + /** AIO control block */ struct fuse_io_priv *io; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index c9aca042e61d..8485dd4300b8 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -358,6 +358,7 @@ enum fuse_opcode { FUSE_FALLOCATE = 43, FUSE_READDIRPLUS = 44, FUSE_RENAME2 = 45, + FUSE_CANONICAL_PATH= 2016, /* CUSE specific operations */ CUSE_INIT = 4096, From 5a0725f49f3ab0ebada002e1d4b13943c3a35d18 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:51:06 +0530 Subject: [PATCH 0038/3220] Revert "SELinux: build fix for 4.1" This reverts commit 43e1b4f528e1654fadd1097f7cc5c50be6e45b77. 
This patch is part of code which is already upstreamed in v4.4. Commits 5c73fceb8c70 (SELinux: Enable setting security contexts on rootfs inodes.), 12f348b9dcf6 (SELinux: rename SE_SBLABELSUPP to SBLABEL_MNT), and b43e725d8d38 (SELinux: use a helper function to determine seclabel). for reference. Signed-off-by: Amit Pundir --- security/selinux/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 78d06ff4eeb7..d4d12949de09 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -469,7 +469,7 @@ static int sb_finish_set_opts(struct super_block *sb) * setting SELinux context on in-core inodes. */ if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0) - sbsec->flags |= SBLABEL_MNT; + sbsec->flags |= SE_SBLABELSUPP; /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); From 66daa5f5e2b9078a7cecf37e896385aeee520005 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:51:20 +0530 Subject: [PATCH 0039/3220] Revert "SELinux: Enable setting security contexts on rootfs inodes." This reverts commit 78d36d2111cd4ca722a602846f7db8f54a0b074c. Drop this duplicate patch. This patch is already upstreamed in v4.4. Commits 5c73fceb8c70 (SELinux: Enable setting security contexts on rootfs inodes.), 12f348b9dcf6 (SELinux: rename SE_SBLABELSUPP to SBLABEL_MNT), and b43e725d8d38 (SELinux: use a helper function to determine seclabel), for reference. Signed-off-by: Amit Pundir --- security/selinux/hooks.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d4d12949de09..9b0586d0f3c4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -464,13 +464,6 @@ static int sb_finish_set_opts(struct super_block *sb) if (selinux_is_sblabel_mnt(sb)) sbsec->flags |= SBLABEL_MNT; - /* - * Special handling for rootfs. Is genfs but supports - * setting SELinux context on in-core inodes. - */ - if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0) - sbsec->flags |= SE_SBLABELSUPP; - /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); From ad95c12f66df9efae04b15d5c4d0d0ba56ab2620 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Mon, 25 Apr 2016 14:28:30 -0700 Subject: [PATCH 0040/3220] Revert "mm: vmscan: Add a debug file for shrinkers" Kernel panic when type "cat /sys/kernel/debug/shrinker" Unable to handle kernel paging request at virtual address 0af37d40 pgd = d4dec000 [0af37d40] *pgd=00000000 Internal error: Oops: 5 [#1] PREEMPT SMP ARM [] (_raw_spin_lock) from [] (list_lru_count_one+0x14/0x28) [] (list_lru_count_one) from [] (super_cache_count+0x40/0xa0) [] (super_cache_count) from [] (debug_shrinker_show+0x50/0x90) [] (debug_shrinker_show) from [] (seq_read+0x1ec/0x48c) [] (seq_read) from [] (__vfs_read+0x20/0xd0) [] (__vfs_read) from [] (vfs_read+0x7c/0x104) [] (vfs_read) from [] (SyS_read+0x44/0x9c) [] (SyS_read) from [] (ret_fast_syscall+0x0/0x3c) Code: e1a04000 e3a00001 ebd66b39 f594f000 (e1943f9f) ---[ end trace 60c74014a63a9688 ]--- Kernel panic - not syncing: Fatal exception shrink_control.nid is used but not initialzed, same for shrink_control.memcg. This reverts commit b0e7a582b2264cdf75874dcd8df915b6b4427755. 
Change-Id: I108de88fa4baaef99a53c4e4c6a1d8c4b4804157 Reported-by: Xiaowen Liu Signed-off-by: Dmitry Shmidt --- mm/vmscan.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c56f6a812bc..2aec4241b42a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include @@ -221,39 +220,6 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); } -struct dentry *debug_file; - -static int debug_shrinker_show(struct seq_file *s, void *unused) -{ - struct shrinker *shrinker; - struct shrink_control sc; - - sc.gfp_mask = -1; - sc.nr_to_scan = 0; - - down_read(&shrinker_rwsem); - list_for_each_entry(shrinker, &shrinker_list, list) { - int num_objs; - - num_objs = shrinker->count_objects(shrinker, &sc); - seq_printf(s, "%pf %d\n", shrinker->scan_objects, num_objs); - } - up_read(&shrinker_rwsem); - return 0; -} - -static int debug_shrinker_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_shrinker_show, inode->i_private); -} - -static const struct file_operations debug_shrinker_fops = { - .open = debug_shrinker_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* * Add a shrinker callback to be called from the vm. */ @@ -283,15 +249,6 @@ int register_shrinker(struct shrinker *shrinker) } EXPORT_SYMBOL(register_shrinker); -static int __init add_shrinker_debug(void) -{ - debugfs_create_file("shrinker", 0644, NULL, NULL, - &debug_shrinker_fops); - return 0; -} - -late_initcall(add_shrinker_debug); - /* * Remove one */ From 4e461c777e345727aa2988377774c996d303ac46 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Apr 2016 17:12:57 -0700 Subject: [PATCH 0041/3220] xt_qtaguid: Fix panic caused by synack processing In upstream commit ca6fb06518836ef9b65dc0aac02ff97704d52a05 (tcp: attach SYNACK messages to request sockets instead of listener) http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ca6fb0651883 The building of synack messages was changed, which made it so the skb->sk points to a casted request_sock. This is problematic, as there is no sk_socket in a request_sock. So when the qtaguid_mt function tries to access the sk->sk_socket, it accesses uninitialized memory. After looking at how other netfilter implementations handle this, I realized there was a skb_to_full_sk() helper added, which the xt_qtaguid code isn't yet using. This patch adds its use, and resovles panics seen when accessing uninitialzed memory when processing synack packets. Reported-by: YongQin Liu Signed-off-by: John Stultz --- net/netfilter/xt_qtaguid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e1442bfb668d..822dc3c3bce1 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1689,7 +1689,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) /* default: Fall through and do UID releated work */ } - sk = skb->sk; + sk = skb_to_full_sk(skb); /* * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. 
From 379cf6e8ddb187ad202bf05d22251e280fee26d1 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 15:14:35 +0530 Subject: [PATCH 0042/3220] Revert "HID: steelseries: validate output report details" This reverts commit 90037b2720acffa6da2269a10ecf24ec2dace89b. Remove duplicate code. This patch is already upstreamed in v4.4, commit 41df7f6d4372 (HID: steelseries: validate output report details). Signed-off-by: Amit Pundir --- drivers/hid/hid-steelseries.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/hid/hid-steelseries.c b/drivers/hid/hid-steelseries.c index 93ddc1c65b4c..3edd4ac36494 100644 --- a/drivers/hid/hid-steelseries.c +++ b/drivers/hid/hid-steelseries.c @@ -253,11 +253,6 @@ static int steelseries_srws1_probe(struct hid_device *hdev, goto err_free; } - if (!hid_validate_values(hdev, HID_OUTPUT_REPORT, 0, 0, 16)) { - ret = -ENODEV; - goto err_free; - } - ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); From 9fc7b46c3d3ffdd135c601e5c28a6bb316f3a011 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 26 Apr 2016 14:47:53 +0530 Subject: [PATCH 0043/3220] Revert "hid-multitouch: Filter collections by application usage." This reverts commit 0840b80cb9626906b57df54e7229db60f9aea4f2. This patch is already upstreamed in v4.4, commit 658d4aed59b3 (HID: hid-multitouch: Filter collections by application usage.), and further fixed/cleaned up afterwards in commits c2ef8f21ea8f (HID: multitouch: add support for trackpads), 76f5902aebda (HID: hid-multitouch: Simplify setup and frame synchronization) et al. By having this duplicate patch in AOSP we are doing redundant checks for Touchscreen and Touchpad devices. Signed-off-by: Amit Pundir --- drivers/hid/hid-multitouch.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 96759b270360..3d664d01305e 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -434,16 +434,6 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi, if ((usage->hid & HID_USAGE_PAGE) == HID_UP_BUTTON) td->buttons_count++; - /* Only map fields from TouchScreen or TouchPad collections. - * We need to ignore fields that belong to other collections - * such as Mouse that might have the same GenericDesktop usages. */ - if (field->application == HID_DG_TOUCHSCREEN) - set_bit(INPUT_PROP_DIRECT, hi->input->propbit); - else if (field->application == HID_DG_TOUCHPAD) - set_bit(INPUT_PROP_POINTER, hi->input->propbit); - else - return 0; - if (usage->usage_index) prev_usage = &field->usage[usage->usage_index - 1]; From 3a343a1540d4376d838c0a29bd5462d4e961e766 Mon Sep 17 00:00:00 2001 From: Yongqin Liu Date: Thu, 28 Apr 2016 13:53:36 +0800 Subject: [PATCH 0044/3220] quick selinux support for tracefs Here is just the quick fix for tracefs with selinux. 
Just add tracefs to the list of whitelisted filesystem types in selinux_is_sblabel_mnt(), but the right fix would be to generalize this logic as described in the last item on the todo list, https://bitbucket.org/seandroid/wiki/wiki/ToDo Change-Id: I2aa803ccffbcd2802a7287514da7648e6b364157 Signed-off-by: Yongqin Liu --- security/selinux/hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9b0586d0f3c4..94a0bfc748d1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -420,6 +420,7 @@ static int selinux_is_sblabel_mnt(struct super_block *sb) !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "debugfs") || + !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "rootfs"); }
From f24888af18176246638498325fb1ecd49e0ef00e Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 2 May 2016 15:32:15 +0530 Subject: [PATCH 0045/3220] Revert "mmc: Add status IRQ and status callback function to mmc platform data" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 91fa97e1e5c001d52f6c993d37be08d1e84f47b7. This patch is no longer valid. There are no users for this status irq and callback in android-4.x. The Qcom platform (mach-msm/qsd8x50, HTC Dream..) and SDCC controller (msm_sdcc) using this status IRQ and callback were dropped from mainline some time back. 27842bb18b00 (mmc: Remove msm_sdcc driver) c0c89fafa289 (ARM: Remove mach-msm and associated ARM architecture code) Change-Id: Ia38e42a06dc184395f79c1ec1d306bf9775704d5 Signed-off-by: Amit Pundir --- include/linux/amba/mmci.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/amba/mmci.h b/include/linux/amba/mmci.h index 8bfd21c13d59..eff56cb0016a 100644 --- a/include/linux/amba/mmci.h +++ b/include/linux/amba/mmci.h @@ -40,10 +40,7 @@ struct mmci_platform_data { int gpio_wp; int gpio_cd; bool cd_invert; - unsigned int status_irq; struct embedded_sdio_data *embedded_sdio; - int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id); - }; #endif
From d114f2c3570fc63019656b6870e4fcc506852334 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Wed, 23 Mar 2016 13:18:03 -0700 Subject: [PATCH 0046/3220] usb: dual-role: make stub functions inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_DUAL_ROLE_USB_INTF is disabled but the exported functions are referenced, the build will result in warnings such as: In file included from include/linux/usb/class-dual-role.h:112:13: warning: ‘dual_role_instance_changed’ defined but not used [-Wunused-function] These stub functions should be static inline.
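A minimal standalone illustration of the warning (hypothetical header and names, not from this tree): a plain static stub defined in a header warns in every translation unit that includes the header without calling it, whereas a static inline stub never warns and emits no code.

	/* example_stubs.h - illustration only */
	static void example_stub(void) { }			/* -Wunused-function when unused */
	static inline void example_stub_inline(void) { }	/* never warns */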
Change-Id: I5a9ef58dca32306fac5a4c7f28cdaa36fa8ae078 Signed-off-by: Jack Pham (cherry picked from commit 2d152dbb0743526b21d6bbefe097f874c027f860) (cherry picked from commit 8ad66cafaa10e6ba94ff79a8dbc2cc437c6bfe93) --- include/linux/usb/class-dual-role.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/usb/class-dual-role.h b/include/linux/usb/class-dual-role.h index af42ed34944a..c6df2238012e 100644 --- a/include/linux/usb/class-dual-role.h +++ b/include/linux/usb/class-dual-role.h @@ -109,18 +109,19 @@ extern int dual_role_property_is_writeable(struct dual_role_phy_instance enum dual_role_property prop); extern void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role); #else /* CONFIG_DUAL_ROLE_USB_INTF */ -static void dual_role_instance_changed(struct dual_role_phy_instance +static inline void dual_role_instance_changed(struct dual_role_phy_instance *dual_role){} -static struct dual_role_phy_instance *__must_check +static inline struct dual_role_phy_instance *__must_check devm_dual_role_instance_register(struct device *parent, const struct dual_role_phy_desc *desc) { return ERR_PTR(-ENOSYS); } -static void devm_dual_role_instance_unregister(struct device *dev, +static inline void devm_dual_role_instance_unregister(struct device *dev, struct dual_role_phy_instance *dual_role){} -static void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role) +static inline void *dual_role_get_drvdata(struct dual_role_phy_instance + *dual_role) { return ERR_PTR(-ENOSYS); }
From 8b43e279a24727794cf6dd61fcad956f532002d3 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 4 May 2016 11:05:15 +0530 Subject: [PATCH 0047/3220] Revert "ARM: Add 'card_present' state to mmc_platfrom_data" This reverts commit 541632275e983573b8250fcd4402f772d7bd1e6f. mmc_platform_data (or arch/arm/include/asm/mach/mmc.h in general) has no active user in android-4.x kernels. Also all the necessary bits are already moved to include/linux/amba/mmci.h. 6ef297f86b62 (ARM: 5720/1: Move MMCI header to amba include dir) Change-Id: Iff384eb527327bf88543408e0257241c1fd99a43 Signed-off-by: Amit Pundir --- arch/arm/include/asm/mach/mmc.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h index bca864ac945f..f8d391ad9203 100644 --- a/arch/arm/include/asm/mach/mmc.h +++ b/arch/arm/include/asm/mach/mmc.h @@ -18,7 +18,6 @@ struct embedded_sdio_data { struct mmc_platform_data { unsigned int ocr_mask; /* available voltages */ int built_in; /* built-in device flag */ - int card_present; /* card detect state */ u32 (*translate_vdd)(struct device *, unsigned int); unsigned int (*status)(struct device *); struct embedded_sdio_data *embedded_sdio;
From b2cee99f2bada8a9da66cd29b71a79aaf52227ce Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 4 May 2016 11:14:16 +0530 Subject: [PATCH 0048/3220] Revert "Recreate asm/mach/mmc.h include file" This reverts commit 5b42ae3edab6c39c1337d36881d29350bb36dcff. This recreated arch/arm/include/asm/mach/mmc.h include file has no active user in android-4.x kernels. Also all the necessary bits are already moved to include/linux/amba/mmci.h.
6ef297f86b62 (ARM: 5720/1: Move MMCI header to amba include dir) Change-Id: Ibf258b355d17f54f49b777a8f6e0089e9b59a3a5 Signed-off-by: Amit Pundir --- arch/arm/include/asm/mach/mmc.h | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 arch/arm/include/asm/mach/mmc.h diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h deleted file mode 100644 index f8d391ad9203..000000000000 --- a/arch/arm/include/asm/mach/mmc.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * arch/arm/include/asm/mach/mmc.h - */ -#ifndef ASMARM_MACH_MMC_H -#define ASMARM_MACH_MMC_H - -#include -#include -#include - -struct embedded_sdio_data { - struct sdio_cis cis; - struct sdio_cccr cccr; - struct sdio_embedded_func *funcs; - int num_funcs; -}; - -struct mmc_platform_data { - unsigned int ocr_mask; /* available voltages */ - int built_in; /* built-in device flag */ - u32 (*translate_vdd)(struct device *, unsigned int); - unsigned int (*status)(struct device *); - struct embedded_sdio_data *embedded_sdio; - int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id); -}; - -#endif
From f355b21139b26c70f38ae3cc19388c513a75242c Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Wed, 4 May 2016 13:51:38 -0700 Subject: [PATCH 0049/3220] fiq_debugger: Add option to apply uart overlay by FIQ_DEBUGGER_UART_OVERLAY fiq_debugger takes over the uart, so it is necessary to disable the original uart in the DT file. This can be done manually or by overlay. Change-Id: I9f50ec15b0e22e602d73b9f745fc8666f8925d09 Signed-off-by: Dmitry Shmidt --- drivers/staging/android/fiq_debugger/Kconfig | 9 ++++++ .../android/fiq_debugger/fiq_debugger.c | 30 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/staging/android/fiq_debugger/Kconfig b/drivers/staging/android/fiq_debugger/Kconfig index 56f7f999377e..60fc224d4efc 100644 --- a/drivers/staging/android/fiq_debugger/Kconfig +++ b/drivers/staging/android/fiq_debugger/Kconfig @@ -42,6 +42,15 @@ config FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE If enabled, this puts the fiq debugger into console mode by default. Otherwise, the fiq debugger will start out in debug mode. +config FIQ_DEBUGGER_UART_OVERLAY + bool "Install uart DT overlay" + depends on FIQ_DEBUGGER + select OF_OVERLAY + default n + help + If enabled, fiq debugger is calling fiq_debugger_uart_overlay() + that will apply overlay uart_overlay@0 to disable proper uart.
+ config FIQ_WATCHDOG bool select FIQ_DEBUGGER diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index 7f056831dbff..0c558331a3d2 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -39,6 +39,10 @@ #include #endif +#ifdef CONFIG_FIQ_DEBUGGER_UART_OVERLAY +#include +#endif + #include #include "fiq_debugger.h" @@ -1201,10 +1205,36 @@ static struct platform_driver fiq_debugger_driver = { }, }; +#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY) +int fiq_debugger_uart_overlay(void) +{ + struct device_node *onp = of_find_node_by_path("/uart_overlay@0"); + int ret; + + if (!onp) { + pr_err("serial_debugger: uart overlay not found\n"); + return -ENODEV; + } + + ret = of_overlay_create(onp); + if (ret < 0) { + pr_err("serial_debugger: fail to create overlay: %d\n", ret); + of_node_put(onp); + return ret; + } + + pr_info("serial_debugger: uart overlay applied\n"); + return 0; +} +#endif + static int __init fiq_debugger_init(void) { #if defined(CONFIG_FIQ_DEBUGGER_CONSOLE) fiq_debugger_tty_init(); +#endif +#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY) + fiq_debugger_uart_overlay(); #endif return platform_driver_register(&fiq_debugger_driver); } From 0ecbd590a7175818fcbccc9d5b1dc819aae25c86 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Thu, 17 Dec 2015 07:34:27 +0800 Subject: [PATCH 0050/3220] sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() If a newly created task is selected to go to a different CPU in fork balance when it wakes up the first time, its load averages should not be removed from the source CPU since they are never added to it before. The same is also applicable to a never used group entity. Fix it in remove_entity_load_avg(): when entity's last_update_time is 0, simply return. This should precisely identify the case in question, because in other migrations, the last_update_time is set to 0 after remove_entity_load_avg(). Reported-by: Steve Muckle Signed-off-by: Yuyang Du [peterz: cfs_rq_last_update_time] Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20151216233427.GJ28098@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cfdc0e61066c..6e7d36f2ed8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2809,6 +2809,27 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } +#ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + u64 last_update_time_copy; + u64 last_update_time; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); + + return last_update_time; +} +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} +#endif + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). 
@@ -2818,17 +2839,14 @@ void remove_entity_load_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; -#ifndef CONFIG_64BIT - u64 last_update_time_copy; + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; - do { - last_update_time_copy = cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time = cfs_rq->avg.last_update_time; - } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif + last_update_time = cfs_rq_last_update_time(cfs_rq); __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
From 259bd4cd8a4c5776e86ae1a86f1f8e1168e84212 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 17 Sep 2015 16:10:56 +0100 Subject: [PATCH 0051/3220] cpufreq: Frequency invariant scheduler load-tracking support Implements cpufreq_scale_freq_capacity() to provide the scheduler with a frequency scaling correction factor for more accurate load-tracking. The factor is: current_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, freq_scale should be a struct cpufreq_policy data member. But this would require the scheduler hot path (__update_load_avg()) to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 29 +++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 32 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8412ce5f93a7..46a7d62c8174 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -347,6 +347,31 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) #endif } +/********************************************************************* + * FREQUENCY INVARIANT CPU CAPACITY * + *********************************************************************/ + +static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +static void +scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) +{ + unsigned long cur = freqs ?
freqs->new : policy->cur; + unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + int cpu; + + pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", + cpumask_pr_args(policy->cpus), cur, policy->max, scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(freq_scale, cpu) = scale; +} + +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { @@ -450,6 +475,8 @@ wait: spin_unlock(&policy->transition_lock); + scale_freq_capacity(policy, freqs); + cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); @@ -2126,6 +2153,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, new_policy); + scale_freq_capacity(new_policy, NULL); + policy->min = new_policy->min; policy->max = new_policy->max; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 177c7680c1a8..11c65f536b6c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -616,4 +616,7 @@ unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +struct sched_domain; +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif /* _LINUX_CPUFREQ_H */
From 372b62344c227283acd23d6ca63543a4ceb27abb Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 12:47:48 +0100 Subject: [PATCH 0052/3220] arm: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Signed-off-by: Dietmar Eggemann --- arch/arm/include/asm/topology.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 370f7a732900..a69917b7d2c9 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -24,6 +24,11 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#include <linux/cpufreq.h> +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +#endif + #else static inline void init_cpu_topology(void) { }
From 8411396e9c5cd1badfe05e08624d90cbbb7b1126 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 25 Sep 2015 17:15:11 +0100 Subject: [PATCH 0053/3220] arm64: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Including <linux/cpufreq.h> in topology.h like for the arm arch doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_freq_capacity() has to be declared extern in topology.h.
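A worked example of the correction factor these three patches wire up (illustrative frequencies, not from any particular platform): with SCHED_CAPACITY_SHIFT = 10, a cpu whose policy->max is 2000000 kHz currently running at 1000000 kHz gets

	freq_scale = (1000000 << 10) / 2000000 = 512

i.e. half of SCHED_CAPACITY_SCALE (1024), so the scheduler's load-tracking accrues utilization at half the rate it would at the maximum P-state.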
Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a3e9d6fdbf21..72c47c3fa7b3 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,6 +22,12 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +struct sched_domain; +extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +#endif + #include <asm-generic/topology.h> #endif /* _ASM_ARM_TOPOLOGY_H */
From 0f8edc82743fd749a3536c68386c89112e760792 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 14 Apr 2015 16:25:31 +0100 Subject: [PATCH 0054/3220] arm: Update arch_scale_cpu_capacity() to reflect change to define arch_scale_cpu_capacity() is no longer a weak function but a #define instead. Include the #define in topology.h. cc: Russell King Signed-off-by: Morten Rasmussen --- arch/arm/include/asm/topology.h | 2 ++ arch/arm/kernel/topology.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index a69917b7d2c9..e3e596cbb1a7 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -28,6 +28,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu); #include <linux/cpufreq.h> #define arch_scale_freq_capacity cpufreq_scale_freq_capacity #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #else diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 08b7847bf912..614554765e44 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -42,7 +42,7 @@ */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); }
From 1eb2b8aa45840b50cd953052315a8b4f6bc6416d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 7 May 2015 18:46:15 +0100 Subject: [PATCH 0055/3220] sched: Store system-wide maximum cpu capacity in root domain To be able to compare the capacity of the target cpu with the highest cpu capacity of the system in the wakeup path, store the system-wide maximum cpu capacity in the root domain.
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 732e993b564b..7c6e2b326349 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6971,6 +6971,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7021,11 +7022,18 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); + + if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) + rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); + if (rq) + pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775bf670..2b7ffa5a20ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -543,6 +543,9 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + /* Maximum cpu capacity in the system. */ + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain;
From e8bcb272bc920d875c26ca2e06c872b9f844f6f1 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 19:53:49 +0100 Subject: [PATCH 0056/3220] sched: Add cpu capacity awareness to wakeup balancing Wakeup balancing is completely unaware of cpu capacity, cpu utilization and task utilization. The task is preferably placed on a cpu which is idle in the instant the wakeup happens. New tasks (SD_BALANCE_{FORK,EXEC}) are placed on an idle cpu in the idlest group if such can be found, otherwise it goes on the least loaded one. Existing tasks (SD_BALANCE_WAKE) are placed on the previous cpu or an idle cpu sharing the same last level cache unless the wakee_flips heuristic in wake_wide() decides to fall back to considering cpus outside SD_LLC. Hence existing tasks are not guaranteed to get a chance to migrate to a different group at wakeup in case the current one has reduced cpu capacity (due to RT/IRQ pressure or different uarch e.g. ARM big.LITTLE). They may eventually get pulled by other cpus doing periodic/idle/nohz_idle balance, but it may take quite a while before it happens. This patch adds capacity awareness to find_idlest_{group,queue} (used by SD_BALANCE_{FORK,EXEC} and SD_BALANCE_WAKE under certain circumstances) such that groups/cpus that can accommodate the waking task based on task utilization are preferred. In addition, wakeup of existing tasks (SD_BALANCE_WAKE) is sent through find_idlest_{group,queue} also if the task doesn't fit the capacity of the previous cpu to allow it to escape (override wake_affine) when necessary instead of relying on periodic/idle/nohz_idle balance to eventually sort it out.
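To make the ~20% margin concrete (the fit test itself appears in the diff below; the utilization numbers here are illustrative): with capacity_margin = 1280 a cpu fits a task if

	capacity_of(cpu) * 1024 > (cpu_util(cpu) + task_util(p)) * 1280

so a waking task with util_avg 400 does not fit an otherwise idle little cpu of capacity 430 (430 * 1024 = 440320 < 512000 = 400 * 1280), and its wakeup is sent through find_idlest_{group,queue} to look for a higher-capacity cpu.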
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e7d36f2ed8f..5e112bba1b5a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4742,6 +4742,43 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) return 1; } +static inline unsigned long task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +static unsigned int capacity_margin = 1280; /* ~20% margin */ + +static inline bool __task_fits(struct task_struct *p, int cpu, int util) +{ + unsigned long capacity = capacity_of(cpu); + + util += task_util(p); + + return (capacity * 1024) > (util * capacity_margin); +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + + if (capacity == max_capacity) + return true; + + if (capacity * capacity_margin > max_capacity * 1024) + return true; + + return __task_fits(p, cpu, 0); +} + +static int cpu_util(int cpu); + +static inline bool task_fits_spare(struct task_struct *p, int cpu) +{ + return __task_fits(p, cpu, cpu_util(cpu)); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -4751,7 +4788,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4782,6 +4821,15 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + /* + * Look for most energy-efficient group that can fit + * the task.
+ */ + if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { + fit_capacity = capacity_of(i); + fit_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4795,6 +4843,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); + if (fit_group) + return fit_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -4815,7 +4866,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { + if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -4827,7 +4878,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if ((!idle || idle->exit_latency == min_exit_latency) && + } else if (idle_cpu(i) && + (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -4836,6 +4888,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; + } else if (shallowest_idle_cpu == -1) { + /* + * If we haven't found an idle CPU yet + * pick a non-idle one that can fit the task as + * fallback. + */ + shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -4950,7 +5009,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) {
From 8a5c0339bb7b7eedd2e75b62ebe4144a195f7ef3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 6 Jul 2015 15:01:10 +0100 Subject: [PATCH 0057/3220] sched: Consider spare cpu capacity at task wake-up find_idlest_group() selects the wake-up target group purely based on group load which leads to suboptimal choices in low load scenarios. An idle group with reduced capacity (due to RT tasks or different cpu type) isn't necessarily a better target than a lightly loaded group with higher capacity. The patch adds spare capacity as an additional group selection parameter. The target group is now selected based on the following criteria: 1. Return the group containing the cpu with the most spare capacity, if such a group exists and the spare capacity is significant. Significant spare capacity currently means at least 20% to spare. 2. Return the group with the lowest load, unless it is the local group in which case NULL is returned and the search is continued at the next (lower) level.
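The "significant" threshold follows from the initialization in the diff below: max_spare_capacity starts at capacity_margin - SCHED_LOAD_SCALE = 1280 - 1024 = 256, so a group is only returned as spare_group if one of its cpus still has more than 256 capacity units left after subtracting cpu_util(), i.e. more than a fifth of the 1280 margin (the ~20% mentioned above).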
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e112bba1b5a..bebc8367edee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4788,9 +4788,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL; + struct sched_group *fit_group = NULL, *spare_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; + unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4798,7 +4799,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_capacity; int local_group; int i; @@ -4830,6 +4831,16 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, fit_capacity = capacity_of(i); fit_group = group; } + + /* + * Look for group which has most spare capacity on a + * single cpu. + */ + spare_capacity = capacity_of(i) - cpu_util(i); + if (spare_capacity > max_spare_capacity) { + max_spare_capacity = spare_capacity; + spare_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4846,6 +4857,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (fit_group) return fit_group; + if (spare_group) + return spare_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest;
From cda2bd333be6e5e8c38fa072c9c4c296312dfd8d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 26 Jan 2015 19:47:28 +0000 Subject: [PATCH 0058/3220] sched: Enable idle balance to pull single task towards cpu with higher capacity We do not want to miss out on the ability to pull a single remaining task from a potential source cpu towards an idle destination cpu. Add an extra criterion to need_active_balance() to kick off active load balance if the source cpu is over-utilized and has lower capacity than the destination cpu. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bebc8367edee..fe40cdb8e640 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4779,6 +4779,11 @@ static inline bool task_fits_spare(struct task_struct *p, int cpu) return __task_fits(p, cpu, cpu_util(cpu)); } +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain.
@@ -6995,6 +7000,13 @@ static int need_active_balance(struct lb_env *env) return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); }
From 681fa1413b869ca73f685b9c8769cdd645879804 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 2 Jul 2015 17:16:34 +0100 Subject: [PATCH 0059/3220] sched: Prevent unnecessary active balance of single task in sched group Scenarios with the busiest group having just one task and the local group being idle on topologies with sched groups with different numbers of cpus manage to dodge all load-balance bailout conditions, resulting in the nr_balance_failed counter being incremented. This eventually causes a pointless active migration of the task. This patch prevents this by not incrementing the counter when the busiest group only has one task. ASYM_PACKING migrations and migrations due to reduced capacity should still take place as these are explicitly captured by need_active_balance(). A better solution would be to not attempt the load-balance in the first place, but that requires significant changes to the order of bailout conditions and statistics gathering. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe40cdb8e640..632163ef7628 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5629,6 +5629,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -6591,6 +6592,8 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + if (!env->sd->parent) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) @@ -7219,7 +7222,8 @@ more_balance: * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags);
From d8c8088f41033b1cea9d88149046e1f8bdb9ade0 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:43:28 +0000 Subject: [PATCH 0060/3220] sched: Documentation for scheduler energy cost model This documentation patch provides an overview of the experimental scheduler energy costing model, associated data structures, and a reference recipe on how platforms can be characterized to derive energy models. Signed-off-by: Morten Rasmussen --- Documentation/scheduler/sched-energy.txt | 362 +++++++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 Documentation/scheduler/sched-energy.txt diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt new file mode 100644 index 000000000000..dab2f9088b33 --- /dev/null +++ b/Documentation/scheduler/sched-energy.txt @@ -0,0 +1,362 @@ +Energy cost model for energy-aware scheduling (EXPERIMENTAL) + +Introduction +============= + +The basic energy model uses platform energy data stored in sched_group_energy +data structures attached to the sched_groups in the sched_domain hierarchy.
The +energy cost model offers two functions that can be used to guide scheduling +decisions: + +1. static unsigned int sched_group_energy(struct energy_env *eenv) +2. static int energy_diff(struct energy_env *eenv) + +sched_group_energy() estimates the energy consumed by all cpus in a specific +sched_group including any shared resources owned exclusively by this group of +cpus. Resources shared with other cpus are excluded (e.g. later level caches). + +energy_diff() estimates the total energy impact of a utilization change. That +is, adding, removing, or migrating utilization (tasks). + +Both functions use a struct energy_env to specify the scenario to be evaluated: + + struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; + }; + +sg_top: sched_group to be evaluated. Not used by energy_diff(). + +sg_cap: sched_group covering the cpus in the same frequency domain. Set by +sched_group_energy(). + +cap_idx: Capacity state to be used for energy calculations. Set by +find_new_capacity(). + +util_delta: Amount of utilization to be added, removed, or migrated. + +src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be +-1 if no source (e.g. task wake-up). + +dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1 +if utilization is removed (e.g. terminating tasks). + +energy: Result of sched_group_energy(). + +The metric used to represent utilization is the actual per-entity running time +averaged over time using a geometric series. Very similar to the existing +per-entity load-tracking, but _not_ scaled by task priority and capped by the +capacity of the cpu. The latter property does mean that utilization may +underestimate the compute requirements for tasks on fully/over utilized cpus. +The greatest potential for energy savings without affecting performance too much +is in scenarios where the system isn't fully utilized. If the system is deemed +fully utilized, load-balancing should be done with task load (includes task +priority) instead in the interest of fairness and performance. + + +Background and Terminology +=========================== + +To make it clear from the start: + +energy = [joule] (resource like a battery on powered devices) +power = energy/time = [joule/second] = [watt] + +The goal of energy-aware scheduling is to minimize energy, while still getting +the job done. That is, we want to maximize: + + performance [inst/s] + -------------------- + power [W] + +which is equivalent to minimizing: + + energy [J] + ----------- + instruction + +while still getting 'good' performance. It is essentially an alternative +optimization objective to the current performance-only objective for the +scheduler. This alternative considers two objectives: energy-efficiency and +performance. Hence, there needs to be a user controllable knob to switch the +objective. Since it is early days, this is currently a sched_feature +(ENERGY_AWARE). + +The idea behind introducing an energy cost model is to allow the scheduler to +evaluate the implications of its decisions rather than applying energy-saving +techniques blindly that may only have positive effects on some platforms. At +the same time, the energy cost model must be as simple as possible to minimize +the scheduler latency impact.
+ +Platform topology +------------------ + +The system topology (cpus, caches, and NUMA information, not peripherals) is +represented in the scheduler by the sched_domain hierarchy which has +sched_groups attached at each level that covers one or more cpus (see +sched-domains.txt for more details). To add energy awareness to the scheduler +we need to consider power and frequency domains. + +Power domain: + +A power domain is a part of the system that can be powered on/off +independently. Power domains are typically organized in a hierarchy where you +may be able to power down just a cpu or a group of cpus along with any +associated resources (e.g. shared caches). Powering up a cpu means that all +power domains it is a part of in the hierarchy must be powered up. Hence, it is +more expensive to power up the first cpu that belongs to a higher level power +domain than powering up additional cpus in the same high level domain. Two +level power domain hierarchy example: + + Power source + +-------------------------------+----... +per group PD G G + | +----------+ | + +--------+-------| Shared | (other groups) +per-cpu PD G G | resource | + | | +----------+ + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + +Frequency domain: + +Frequency domains (P-states) typically cover the same group of cpus as one of +the power domain levels. That is, there might be several smaller power domains +sharing the same frequency (P-state) or there might be a power domain spanning +multiple frequency domains. + +From a scheduling point of view there is no need to know the actual frequencies +[Hz]. All the scheduler cares about is the compute capacity available at the +current state (P-state) the cpu is in and any other available states. For that +reason, and to also factor in any cpu micro-architecture differences, compute +capacity scaling states are called 'capacity states' in this document. For SMP +systems this is equivalent to P-states. For mixed micro-architecture systems +(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture +performance relative to the other cpus in the system. + +Energy modelling: +------------------ + +Due to the hierarchical nature of the power domains, the most obvious way to +model energy costs is therefore to associate power and energy costs with +domains (groups of cpus). Energy costs of shared resources are associated with +the group of cpus that share the resources, only the cost of powering the +cpu itself and any private resources (e.g. private L1 caches) is associated +with the per-cpu groups (lowest level). + +For example, for an SMP system with per-cpu power domains and a cluster level +(group of cpus) power domain we get the overall energy costs to be: + + energy = energy_cluster + n * energy_cpu + +where 'n' is the number of cpus powered up and energy_cluster is the cost paid +as soon as any cpu in the cluster is powered up. + +The power and frequency domains can naturally be mapped onto the existing +sched_domain hierarchy and sched_groups by adding the necessary data to the +existing data structures. + +The energy model considers energy consumption from two contributors (shown in +the illustration below): + +1. Busy energy: Energy consumed while a cpu and the higher level groups that it +belongs to are busy running tasks. Busy energy is associated with the state of +the cpu, not an event. The time the cpu spends in this state varies. Thus, the +most obvious platform parameter for this contribution is busy power +(energy/time). 
+ +2. Idle energy: Energy consumed while a cpu and higher level groups that it +belongs to are idle (in a C-state). Like busy energy, idle energy is associated +with the state of the cpu. Thus, the platform parameter for this contribution +is idle power (energy/time). + +Energy consumed during transitions from an idle-state (C-state) to a busy state +(P-state) or going the other way is ignored by the model to simplify the energy +model calculations. + + + Power + ^ + | busy->idle idle->busy + | transition transition + | + | _ __ + | / \ / \__________________ + |______________/ \ / + | \ / + | Busy \ Idle / Busy + | low P-state \____________/ high P-state + | + +------------------------------------------------------------> time + +Busy |--------------| |-----------------| + +Wakeup |------| |------| + +Idle |------------| + + +The basic algorithm +==================== + +The basic idea is to determine the total energy impact when utilization is +added or removed by estimating the impact at each level in the sched_domain +hierarchy starting from the bottom (sched_group contains just a single cpu). +The energy cost comes from busy time (sched_group is awake because one or more +cpus are busy) and idle time (in an idle-state). Energy model numbers account +for energy costs associated with all cpus in the sched_group as a group. + + for_each_domain(cpu, sd) { + sg = sched_group_of(cpu) + energy_before = curr_util(sg) * busy_power(sg) + + (1-curr_util(sg)) * idle_power(sg) + energy_after = new_util(sg) * busy_power(sg) + + (1-new_util(sg)) * idle_power(sg) + energy_diff += energy_before - energy_after + + } + + return energy_diff + +{curr, new}_util: The cpu utilization at the lowest level and the overall +non-idle time for the entire group for higher levels. Utilization is in the +range 0.0 to 1.0 in the pseudo-code. + +busy_power: The power consumption of the sched_group. + +idle_power: The power consumption of the sched_group when idle. + +Note: It is a fundamental assumption that the utilization is (roughly) scale +invariant. Task utilization tracking factors in any frequency scaling and +performance scaling differences due to different cpu microarchitectures such +that task utilization can be used across the entire system. + + +Platform energy data +===================== + +struct sched_group_energy can be attached to sched_groups in the sched_domain +hierarchy and has the following members: + +cap_states: + List of struct capacity_state representing the supported capacity states + (P-states). struct capacity_state has two members: cap and power, which + represent the compute capacity and the busy_power of the state. The + list must be ordered by capacity low->high. + +nr_cap_states: + Number of capacity states in cap_states list. + +idle_states: + List of struct idle_state containing idle_state power cost for each + idle-state supported by the system ordered by shallowest state first. + All states must be included at all levels in the hierarchy, i.e. a + sched_group spanning just a single cpu must also include coupled + idle-states (cluster states). In addition to the cpuidle idle-states, + the list must also contain an entry for idling using the arch + default idle (arch_idle_cpu()). Although this state may not be a true + hardware idle-state, it is considered the shallowest idle-state in the + energy model and must be the first entry. cpus may enter this state + (possibly 'active idling') if cpuidle decides not to enter a cpuidle + idle-state.
Default idle may not be used when cpuidle is enabled. + In this case, it should just be a copy of the first cpuidle idle-state. + +nr_idle_states: + Number of idle states in idle_states list. + +There are no unit requirements for the energy cost data. Data can be normalized +with any reference, however, the normalization must be consistent across all +energy cost data. That is, one bogo-joule/watt must be the same quantity for +all data, but we don't care what it is. + +A recipe for platform characterization +======================================= + +Obtaining the actual model data for a particular platform requires some way of +measuring power/energy. There isn't a tool to help with this (yet). This +section provides a recipe for use as reference. It covers the steps used to +characterize the ARM TC2 development platform. This sort of measurement is +expected to be done anyway when tuning cpuidle and cpufreq for a given +platform. + +The energy model needs two types of data (struct sched_group_energy holds +these) for each sched_group where energy costs should be taken into account: + +1. Capacity state information + +A list containing the compute capacity and power consumption when fully +utilized attributed to the group as a whole for each available capacity state. +At the lowest level (group contains just a single cpu) this is the power of the +cpu alone without including power consumed by resources shared with other cpus. +It basically needs to fit the basic modelling approach described in the "Background +and Terminology" section: + + energy_system = energy_shared + n * energy_cpu + +for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at +the lowest level. 'energy_shared' is included at the next level which +represents the group of cpus among which the resources are shared. + +This model is, of course, a simplification of reality. Thus, power/energy +attributions might not always exactly represent how the hardware is designed. +Also, busy power is likely to depend on the workload. It is therefore +recommended to use a representative mix of workloads when characterizing the +capacity states. + +If the group has no capacity scaling support, the list will contain a single +state where power is the busy power attributed to the group. The capacity +should be set to a default value (1024). + +When frequency domains include multiple power domains, the group representing +the frequency domain and all child groups share capacity states. This must be +indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at +all levels that share the capacity state must have the list of capacity states +with the power set to the contribution of the individual group. + +2. Idle power information + +Stored in the idle_states list. The power number is the group idle power +consumption in each idle state as well as when the group is idle but has not +entered an idle-state ('active idle' as mentioned earlier). Due to the way the +energy model is defined, the idle power of the deepest group idle state can +alternatively be accounted for in the parent group busy power. In that case the +group idle state power values are offset such that the idle power of the +deepest state is zero. It is less intuitive, but it is easier to measure as +idle power consumed by the group and the busy/idle power of the parent group +cannot be distinguished without per group measurement points.
+ +Measuring capacity states and idle power: + +The capacity states' capacity and power can be estimated by running a benchmark +workload at each available capacity state. By restricting the benchmark to run +on subsets of cpus it is possible to extrapolate the power consumption of +shared resources. + +ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a +shared L2 cache. TC2 has on-chip energy counters per cluster. Running a +benchmark workload on just one cpu in a cluster means that power is consumed in +the cluster (higher level group) and a single cpu (lowest level group). Adding +another benchmark task to another cpu increases the power consumption by the +amount consumed by the additional cpu. Hence, it is possible to extrapolate the +cluster busy power. + +For platforms that don't have energy counters or equivalent instrumentation +built-in, it may be possible to use an external DAQ to acquire similar data. + +If the benchmark includes some performance score (for example sysbench cpu +benchmark), this can be used to record the compute capacity. + +Measuring idle power requires insight into the idle state implementation on the +particular platform. Specifically, if the platform has coupled idle-states (or +package states). To measure non-coupled per-cpu idle-states it is necessary to +keep one cpu busy to keep any shared resources alive to isolate the idle power +of the cpu from idle/busy power of the shared resources. The cpu can be tricked +into different per-cpu idle states by disabling the other states. Based on +various combinations of measurements with specific cpus busy and disabling +idle-states it is possible to extrapolate the idle-state power.
From e496f328c87173fe188e91c6cd60197fb8e7e305 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:45:51 +0000 Subject: [PATCH 0061/3220] sched: Make energy awareness a sched feature This patch introduces the ENERGY_AWARE sched feature, which is implemented using jump labels when SCHED_DEBUG is defined. It is statically set false when SCHED_DEBUG is not defined. Hence this doesn't allow energy awareness to be enabled without SCHED_DEBUG. This sched_feature knob will be replaced later with a more appropriate control knob when things have matured a bit. ENERGY_AWARE is based on per-entity load-tracking, hence FAIR_GROUP_SCHED must be enabled. This dependency isn't checked at compile time yet. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++++ kernel/sched/features.h | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 632163ef7628..efc65c3db6b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4651,6 +4651,11 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..b634151ce286 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -69,3 +69,8 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) +/* + * Energy aware scheduling. Use platform energy model to guide scheduling + * decisions optimizing for energy efficiency.
+ */ +SCHED_FEAT(ENERGY_AWARE, false) From 0b3bda54d245ebcfeefc61f0ff763e76ca2a8784 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:08:45 +0000 Subject: [PATCH 0062/3220] sched: Introduce energy data structures The struct sched_group_energy represents the per sched_group related data which is needed for energy aware scheduling. It contains: (1) number of elements of the idle state array (2) pointer to the idle state array which comprises 'power consumption' for each idle state (3) number of elements of the capacity state array (4) pointer to the capacity state array which comprises 'compute capacity and power consumption' tuples for each capacity state The struct sched_group obtains a pointer to a struct sched_group_energy. The function pointer sched_domain_energy_f is introduced into struct sched_domain_topology_level which will allow the arch to pass a particular struct sched_group_energy from the topology shim layer into the scheduler core. The function pointer sched_domain_energy_f has an 'int cpu' parameter since the folding of two adjacent sd levels via sd degenerate doesn't work for all sd levels. I.e. it is not possible for example to use this feature to provide per-cpu energy in sd level DIE on ARM's TC2 platform. It was discussed that the folding of sd levels approach is preferable over the cpu parameter approach, simply because the user (the arch specifying the sd topology table) can introduce less errors. But since it is not working, the 'int cpu' parameter is the only way out. It's possible to use the folding of sd levels approach for sched_domain_flags_f and the cpu parameter approach for the sched_domain_energy_f at the same time though. With the use of the 'int cpu' parameter, an extra check function has to be provided to make sure that all cpus spanned by a sched group are provisioned with the same energy data. 
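As an illustration of how an arch could provision these structures through the topology shim layer (hypothetical function and array names, made-up capacity/power numbers):

	static struct idle_state idle_states_cluster[] = {
		{ .power = 10 },	/* arch default idle / WFI */
		{ .power =  0 },	/* cluster power-down */
	};

	static struct capacity_state cap_states_cluster[] = {
		/* cap, power; list ordered by capacity low->high */
		{ .cap = 150, .power = 25 },
		{ .cap = 300, .power = 55 },
		{ .cap = 430, .power = 90 },
	};

	static struct sched_group_energy energy_cluster = {
		.nr_idle_states	= ARRAY_SIZE(idle_states_cluster),
		.idle_states	= idle_states_cluster,
		.nr_cap_states	= ARRAY_SIZE(cap_states_cluster),
		.cap_states	= cap_states_cluster,
	};

	/* sched_domain_energy_f implementation for one sd level */
	static const struct sched_group_energy * const cpu_cluster_energy(int cpu)
	{
		return &energy_cluster;	/* same data for every cpu in the group */
	}

The arch's sched_domain_topology_level table would then carry cpu_cluster_energy in the new energy member at the corresponding level.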
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- include/linux/sched.h | 19 +++++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 20 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index fa39434e3fdd..5bee272a86d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,22 @@ struct sched_domain_attr { extern int sched_domain_level_max; +struct capacity_state { + unsigned long cap; /* compute capacity */ + unsigned long power; /* power consumption at this compute capacity */ +}; + +struct idle_state { + unsigned long power; /* power consumption in this idle state */ +}; + +struct sched_group_energy { + unsigned int nr_idle_states; /* number of idle states */ + struct idle_state *idle_states; /* ptr to idle state array */ + unsigned int nr_cap_states; /* number of capacity states */ + struct capacity_state *cap_states; /* ptr to capacity state array */ +}; + struct sched_group; struct sched_domain { @@ -1119,6 +1135,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); +typedef +const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu); #define SDTL_OVERLAP 0x01 @@ -1131,6 +1149,7 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; + sched_domain_energy_f energy; int flags; int numa_level; struct sd_data data; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b7ffa5a20ad..1813cba2995d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -863,6 +863,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + const struct sched_group_energy const *sge; /* * The CPUs this group covers. From 4ce990ec65231af6a148c2c0e6b4745fc1d12368 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:20:20 +0000 Subject: [PATCH 0063/3220] sched: Initialize energy data structures The sched_group_energy (sge) pointer of the first sched_group (sg) in the sched_domain (sd) is initialized to point to the appropriate (in terms of sd level and cpu) sge data defined in the arch and so to the correct part of the Energy Model (EM). Energy-aware scheduling allows that a system has only EM data up to a certain sd level (so called highest energy aware balancing sd level). A check in init_sched_energy() enforces that all sd's below this sd level contain EM data. The 'int cpu' parameter of sched_domain_energy_f requires that check_sched_energy_data() makes sure that all cpus spanned by a sg are provisioned with the same EM data. This patch has also been tested with feature FORCE_SD_OVERLAP enabled. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c6e2b326349..f1b1c9eaeb10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6304,6 +6304,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } +/* + * Check that the per-cpu provided sd energy data is consistent for all cpus + * within the mask. 
+ */ +static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn, + const struct cpumask *cpumask) +{ + const struct sched_group_energy * const sge = fn(cpu); + struct cpumask mask; + int i; + + if (cpumask_weight(cpumask) <= 1) + return; + + cpumask_xor(&mask, cpumask, get_cpu_mask(cpu)); + + for_each_cpu(i, &mask) { + const struct sched_group_energy * const e = fn(i); + int y; + + BUG_ON(e->nr_idle_states != sge->nr_idle_states); + + for (y = 0; y < (e->nr_idle_states); y++) { + BUG_ON(e->idle_states[y].power != + sge->idle_states[y].power); + } + + BUG_ON(e->nr_cap_states != sge->nr_cap_states); + + for (y = 0; y < (e->nr_cap_states); y++) { + BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap); + BUG_ON(e->cap_states[y].power != + sge->cap_states[y].power); + } + } +} + +static void init_sched_energy(int cpu, struct sched_domain *sd, + sched_domain_energy_f fn) +{ + if (!(fn && fn(cpu))) + return; + + if (cpu != group_balance_cpu(sd->groups)) + return; + + if (sd->child && !sd->child->groups->sge) { + pr_err("BUG: EAS setup broken for CPU%d\n", cpu); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" energy data on %s but not on %s domain\n", + sd->name, sd->child->name); +#endif + return; + } + + check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); + + sd->groups->sge = fn(cpu); +} + /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -7010,10 +7070,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { + struct sched_domain_topology_level *tl = sched_domain_topology; + if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } From f0f739d887a4f144ab4937e619288d4cede2cc91 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:50:46 +0000 Subject: [PATCH 0064/3220] sched: Introduce SD_SHARE_CAP_STATES sched_domain flag cpufreq is currently keeping it a secret which cpus are sharing clock source. The scheduler needs to know about clock domains as well to become more energy aware. The SD_SHARE_CAP_STATES domain flag indicates whether cpus belonging to the sched_domain share capacity states (P-states). There is no connection with cpufreq (yet). The flag must be set by the arch specific topology code. 
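Once an arch sets this flag, the scheduler core can answer "do these two
cpus share P-states?" by looking up the highest sched_domain carrying the
flag. A minimal sketch, not part of the patch (a later patch in this
series caches exactly this lookup in a per-cpu sd_scs pointer):

static bool share_cap_states(int this_cpu, int that_cpu)
{
	struct sched_domain *sd;
	bool ret = false;

	rcu_read_lock();
	sd = highest_flag_domain(this_cpu, SD_SHARE_CAP_STATES);
	if (sd)
		ret = cpumask_test_cpu(that_cpu, sched_domain_span(sd));
	rcu_read_unlock();

	return ret;
}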
cc: Russell King
cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 arch/arm/kernel/topology.c | 3 ++-
 include/linux/sched.h      | 1 +
 kernel/sched/core.c        | 10 +++++++---
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 614554765e44..38e7be162b79 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -277,7 +277,8 @@ void store_cpu_topology(unsigned int cpuid)
 
 static inline int cpu_corepower_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
+	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+	       SD_SHARE_CAP_STATES;
 }
 
 static struct sched_domain_topology_level arm_topology[] = {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5bee272a86d8..09e2d43406eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -989,6 +989,7 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 #define SD_NUMA			0x4000	/* cross-node balancing */
+#define SD_SHARE_CAP_STATES	0x8000	/* Domain members share capacity state */
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1b1c9eaeb10..2863d223c07e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5775,7 +5775,8 @@ static int sd_degenerate(struct sched_domain *sd)
 			 SD_BALANCE_EXEC |
 			 SD_SHARE_CPUCAPACITY |
 			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
+			 SD_SHARE_POWERDOMAIN |
+			 SD_SHARE_CAP_STATES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5807,7 +5808,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_SHARE_CPUCAPACITY |
 				SD_SHARE_PKG_RESOURCES |
 				SD_PREFER_SIBLING |
-				SD_SHARE_POWERDOMAIN);
+				SD_SHARE_POWERDOMAIN |
+				SD_SHARE_CAP_STATES);
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -6472,6 +6474,7 @@ static int sched_domains_curr_level;
  * SD_SHARE_PKG_RESOURCES - describes shared caches
  * SD_NUMA                - describes NUMA topologies
  * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * SD_SHARE_CAP_STATES    - describes shared capacity states
  *
  * Odd one out:
  * SD_ASYM_PACKING        - describes SMT quirks
@@ -6481,7 +6484,8 @@ static int sched_domains_curr_level;
 			 SD_SHARE_PKG_RESOURCES |	\
 			 SD_NUMA |			\
 			 SD_ASYM_PACKING |		\
-			 SD_SHARE_POWERDOMAIN)
+			 SD_SHARE_POWERDOMAIN |		\
+			 SD_SHARE_CAP_STATES)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl, int cpu)

From c7aeeb88c7370af2fc4e2e8bd156e001f7a96d11 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 10 Jul 2015 13:57:19 +0100
Subject: [PATCH 0065/3220] arm: Cpu invariant scheduler load-tracking and
 capacity support

Provides the scheduler with a cpu scaling correction factor for more
accurate load-tracking and cpu capacity handling.

The Energy Model (EM) (in fact the capacity value of the last element
of the capacity states vector of the core (MC) level sched_group_energy
structure) is used instead of the arm arch specific cpu_efficiency and
dtb property 'clock-frequency' values as the source for this cpu
scaling factor.

The cpu capacity value depends on the micro-architecture and the
maximum frequency of the cpu.

The maximum frequency part should not be confused with the frequency
invariant scheduler load-tracking support which deals with frequency
related scaling due to DVFS functionality.
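As a worked example, using the A57/A53 numbers from the DT bindings
documentation added later in this series (illustrative only): the last
capacity state of the A57 core-level EM is {cap = 1024, power = 616} and
that of the A53 is {cap = 447, power = 93}, so

	cpu_scale(A57) = cap_states[nr_cap_states - 1].cap = 1024
	cpu_scale(A53) = cap_states[nr_cap_states - 1].cap =  447

i.e. an A53 running flat out accrues roughly 447/1024 of the utilization
an A57 would accrue over the same wall-clock time.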
Signed-off-by: Juri Lelli
Signed-off-by: Dietmar Eggemann
---
 arch/arm/kernel/topology.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 38e7be162b79..da1c611a3b5e 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -153,6 +153,8 @@ static void __init parse_dt_topology(void)
 
 }
 
+static const struct sched_group_energy * const cpu_core_energy(int cpu);
+
 /*
  * Look for a customed capacity of a CPU in the cpu_capacity table during the
  * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
  * function returns directly for SMP system.
  */
 static void update_cpu_capacity(unsigned int cpu)
 {
-	if (!cpu_capacity(cpu))
-		return;
+	unsigned long capacity = SCHED_CAPACITY_SCALE;
 
-	set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+	if (cpu_core_energy(cpu)) {
+		int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+		capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+	}
+
+	set_capacity_scale(cpu, capacity);
 
 	pr_info("CPU%u: update cpu_capacity %lu\n",
 		cpu, arch_scale_cpu_capacity(NULL, cpu));

From 7c791bf110c2c1884f02f0c4ceaff886fad2a907 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 30 Apr 2015 11:53:48 +0100
Subject: [PATCH 0066/3220] arm64: Cpu invariant scheduler load-tracking and
 capacity support

Provides the scheduler with a cpu scaling correction factor for more
accurate load-tracking and cpu capacity handling.

The Energy Model (EM) (in fact the capacity value of the last element
of the capacity states vector of the core (MC) level sched_group_energy
structure) is used as the source for this cpu scaling factor.

The cpu capacity value depends on the micro-architecture and the
maximum frequency of the cpu.

The maximum frequency part should not be confused with the frequency
invariant scheduler load-tracking support which deals with frequency
related scaling due to DVFS functionality.
Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 4 ++- arch/arm64/kernel/topology.c | 42 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 72c47c3fa7b3..9370a336c934 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,11 +22,13 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity -struct sched_domain; extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #include diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..fb99a6735fd4 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -23,6 +23,18 @@ #include #include +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; + +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -211,6 +223,35 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +static inline int cpu_corepower_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; +} + +static struct sched_domain_topology_level arm64_topology[] = { +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static void update_cpu_capacity(unsigned int cpu) +{ + unsigned long capacity = SCHED_CAPACITY_SCALE; + + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); + + pr_info("CPU%d: update cpu_capacity %lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); +} + static void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -272,6 +313,7 @@ void store_cpu_topology(unsigned int cpuid) topology_populated: update_siblings_masks(cpuid); + update_cpu_capacity(cpuid); } static void __init reset_cpu_topology(void) From 3e55d2f2fcaf44ac19eb143d895924ff402e375c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 14:11:28 +0000 Subject: [PATCH 0067/3220] sched: Compute cpu capacity available at current frequency capacity_orig_of() returns the max available compute capacity of a cpu. For scale-invariant utilization tracking and energy-aware scheduling decisions it is useful to know the compute capacity available at the current OPP of a cpu. 
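As a worked example with hypothetical numbers: for a cpu with a
capacity_orig of 1024 currently running at half its maximum frequency,
arch_scale_freq_capacity() reports 512, so the capacity available at the
current OPP is

	capacity_curr = capacity_orig * arch_scale_freq_capacity(NULL, cpu)
				>> SCHED_CAPACITY_SHIFT
		      = 1024 * 512 >> 10
		      = 512

which is what the helper introduced below computes.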
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efc65c3db6b5..ea966d9193cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4651,6 +4651,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); From b6c0399e0f340887f369c6870ba6a865afe0b0ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 11 Dec 2014 15:25:29 +0000 Subject: [PATCH 0068/3220] sched: Relocated cpu_util() and change return type Move cpu_util() to an earlier position in fair.c and change return type to unsigned long as negative usage doesn't make much sense. All other load and capacity related functions use unsigned long including the caller of cpu_util(). cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 70 ++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea966d9193cd..318141042a3e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4662,6 +4662,40 @@ static unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static unsigned long cpu_util(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + return (util >= capacity) ? 
capacity : util; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); @@ -4788,8 +4822,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static int cpu_util(int cpu); - static inline bool task_fits_spare(struct task_struct *p, int cpu) { return __task_fits(p, cpu, cpu_util(cpu)); @@ -4988,40 +5020,6 @@ done: return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static int cpu_util(int cpu) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - return (util >= capacity) ? capacity : util; -} - /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, From 5ec8ccabfef3989f4514f22c034a15aeab691110 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 17:08:52 +0000 Subject: [PATCH 0069/3220] sched: Highest energy aware balancing sched_domain level pointer Add another member to the family of per-cpu sched_domain shortcut pointers. This one, sd_ea, points to the highest level at which energy model is provided. At this level and all levels below all sched_groups have energy model data attached. Partial energy model information is possible but restricted to providing energy model data for lower level sched_domains (sd_ea and below) and leaving load-balancing on levels above to non-energy-aware load-balancing. For example, it is possible to apply energy-aware scheduling within each socket on a multi-socket system and let normal scheduling handle load-balancing between sockets. 
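A consumer dereferences the new shortcut under RCU and can then walk the
sched_groups at that level, each of which is guaranteed to carry energy
data. A minimal sketch, not part of the patch (the energy_diff() helper
added later in this series follows this pattern):

	struct sched_domain *sd;
	struct sched_group *sg;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_ea, cpu));
	if (sd) {
		sg = sd->groups;
		do {
			/* sg->sge is valid for every group at this level */
		} while (sg = sg->next, sg != sd->groups);
	}
	rcu_read_unlock();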
cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 kernel/sched/core.c  | 11 ++++++++++-
 kernel/sched/sched.h | 1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2863d223c07e..45e54b0e3669 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5993,11 +5993,12 @@ DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
 
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
-	struct sched_domain *busy_sd = NULL;
+	struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
 	int id = cpu;
 	int size = 1;
 
@@ -6018,6 +6019,14 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+	for_each_domain(cpu, sd) {
+		if (sd->groups->sge)
+			ea_sd = sd;
+		else
+			break;
+	}
+	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1813cba2995d..a77516a15ba0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -839,6 +839,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_ea);
 
 struct sched_group_capacity {
 	atomic_t ref;

From c1770a521398102b6f2447502035887297b17ea0 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Thu, 18 Dec 2014 14:47:18 +0000
Subject: [PATCH 0070/3220] sched: Calculate energy consumption of sched_group

For energy-aware load-balancing decisions it is necessary to know the
energy consumption estimates of groups of cpus. This patch introduces a
basic function, sched_group_energy(), which estimates the energy
consumption of the cpus in the group and any resources shared by the
members of the group.

NOTE: The function has five levels of indentation and breaks the 80
character limit. Refactoring is necessary.

cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 kernel/sched/core.c  | 4 ++
 kernel/sched/fair.c  | 156 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h | 1 +
 3 files changed, 161 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 45e54b0e3669..bc5bb26ccf3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5994,6 +5994,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -6027,6 +6028,9 @@ static void update_top_cache_domain(int cpu)
 		break;
 	}
 	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+	sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+	rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 318141042a3e..49f7567c6b07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4701,6 +4701,162 @@ static inline bool energy_aware(void)
 	return sched_feat(ENERGY_AWARE);
 }
 
+/*
+ * cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
+ * energy calculations.
Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. + * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +{ + int util = cpu_util(cpu); + + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static unsigned long group_max_util(struct sched_group *sg) +{ + int i; + unsigned long max_util = 0; + + for_each_cpu(i, sched_group_cpus(sg)) + max_util = max(max_util, cpu_util(i)); + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in + * energy calculations. Since task executions may or may not overlap in time in + * the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +{ + int i; + unsigned long util_sum = 0; + unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + + for_each_cpu(i, sched_group_cpus(sg)) + util_sum += cpu_norm_util(i, capacity); + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct sched_group *sg, + const struct sched_group_energy const *sge) +{ + int idx; + unsigned long util = group_max_util(sg); + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) + return idx; + } + + return idx; +} + +/* + * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. + */ +static int sched_group_energy(struct sched_group *sg_top) +{ + struct sched_domain *sd; + int cpu, total_energy = 0; + struct cpumask visit_cpus; + struct sched_group *sg; + + WARN_ON(!sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + + if (!sd) + /* + * We most probably raced with hotplug; returning a + * wrong energy estimation is better than entering an + * infinite loop. + */ + return -EINVAL; + + if (sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? 
*/ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + struct sched_group *sg_cap_util; + unsigned long group_util; + int sg_busy_energy, sg_idle_energy, cap_idx; + + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + sg_cap_util = sg_shared_cap; + else + sg_cap_util = sg; + + cap_idx = find_new_capacity(sg_cap_util, sg->sge); + group_util = group_norm_util(sg, cap_idx); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) + >> SCHED_CAPACITY_SHIFT; + + total_energy += sg_busy_energy + sg_idle_energy; + + if (!sd->child) + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } +next_cpu: + continue; + } + + return total_energy; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a77516a15ba0..a618db2936d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -840,6 +840,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); DECLARE_PER_CPU(struct sched_domain *, sd_ea); +DECLARE_PER_CPU(struct sched_domain *, sd_scs); struct sched_group_capacity { atomic_t ref; From df2030c8412e6acdb3f7ec62ed195cc7967c30e2 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 14:21:56 +0000 Subject: [PATCH 0071/3220] sched: Extend sched_group_energy to test load-balancing decisions Extended sched_group_energy() to support energy prediction with usage (tasks) added/removed from a specific cpu or migrated between a pair of cpus. Useful for load-balancing decision making. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 49f7567c6b07..f3de37414a66 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4688,12 +4688,21 @@ static unsigned long capacity_curr_of(int cpu) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static unsigned long cpu_util(int cpu) +static unsigned long __cpu_util(int cpu, int delta) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - return (util >= capacity) ? capacity : util; + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); } static inline bool energy_aware(void) @@ -4701,8 +4710,18 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; +}; + /* - * cpu_norm_util() returns the cpu util relative to a specific capacity, + * __cpu_norm_util() returns the cpu util relative to a specific capacity, * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for * energy calculations. 
Using the scale-invariant util returned by * cpu_util() and approximating scale-invariant util by: @@ -4715,9 +4734,9 @@ static inline bool energy_aware(void) * * norm_util = running_time/time ~ util/capacity */ -static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) { - int util = cpu_util(cpu); + int util = __cpu_util(cpu, delta); if (util >= capacity) return SCHED_CAPACITY_SCALE; @@ -4725,13 +4744,25 @@ static unsigned long cpu_norm_util(int cpu, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct sched_group *sg) +static int calc_util_delta(struct energy_env *eenv, int cpu) { - int i; + if (cpu == eenv->src_cpu) + return -eenv->util_delta; + if (cpu == eenv->dst_cpu) + return eenv->util_delta; + return 0; +} + +static +unsigned long group_max_util(struct energy_env *eenv) +{ + int i, delta; unsigned long max_util = 0; - for_each_cpu(i, sched_group_cpus(sg)) - max_util = max(max_util, cpu_util(i)); + for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { + delta = calc_util_delta(eenv, i); + max_util = max(max_util, __cpu_util(i, delta)); + } return max_util; } @@ -4745,31 +4776,36 @@ static unsigned long group_max_util(struct sched_group *sg) * latter is used as the estimate as it leads to a more pessimistic energy * estimate (more busy). */ -static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +static unsigned +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i; + int i, delta; unsigned long util_sum = 0; - unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; - for_each_cpu(i, sched_group_cpus(sg)) - util_sum += cpu_norm_util(i, capacity); + for_each_cpu(i, sched_group_cpus(sg)) { + delta = calc_util_delta(eenv, i); + util_sum += __cpu_norm_util(i, capacity, delta); + } if (util_sum > SCHED_CAPACITY_SCALE) return SCHED_CAPACITY_SCALE; return util_sum; } -static int find_new_capacity(struct sched_group *sg, +static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy const *sge) { int idx; - unsigned long util = group_max_util(sg); + unsigned long util = group_max_util(eenv); for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) - return idx; + break; } + eenv->cap_idx = idx; + return idx; } @@ -4783,16 +4819,16 @@ static int find_new_capacity(struct sched_group *sg, * This can probably be done in a faster but more complex way. * Note: sched_group_energy() may fail when racing with sched_domain updates. 
*/ -static int sched_group_energy(struct sched_group *sg_top) +static int sched_group_energy(struct energy_env *eenv) { struct sched_domain *sd; int cpu, total_energy = 0; struct cpumask visit_cpus; struct sched_group *sg; - WARN_ON(!sg_top->sge); + WARN_ON(!eenv->sg_top->sge); - cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; @@ -4824,17 +4860,16 @@ static int sched_group_energy(struct sched_group *sg_top) break; do { - struct sched_group *sg_cap_util; unsigned long group_util; int sg_busy_energy, sg_idle_energy, cap_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) - sg_cap_util = sg_shared_cap; + eenv->sg_cap = sg_shared_cap; else - sg_cap_util = sg; + eenv->sg_cap = sg; - cap_idx = find_new_capacity(sg_cap_util, sg->sge); - group_util = group_norm_util(sg, cap_idx); + cap_idx = find_new_capacity(eenv, sg->sge); + group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) @@ -4845,7 +4880,7 @@ static int sched_group_energy(struct sched_group *sg_top) if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); - if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); @@ -4854,7 +4889,8 @@ next_cpu: continue; } - return total_energy; + eenv->energy = total_energy; + return 0; } /* From 2c6a8a48a701ca5d767f249eacb776f80858cbaa Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 6 Jan 2015 17:34:05 +0000 Subject: [PATCH 0072/3220] sched: Estimate energy impact of scheduling decisions Adds a generic energy-aware helper function, energy_diff(), that calculates energy impact of adding, removing, and migrating utilization in the system. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3de37414a66..043715612227 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4893,6 +4893,58 @@ next_cpu: return 0; } +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. + */ +static int energy_diff(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + + struct energy_env eenv_before = { + .util_delta = 0, + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + }; + + if (eenv->src_cpu == eenv->dst_cpu) + return 0; + + sd_cpu = (eenv->src_cpu != -1) ? 
eenv->src_cpu : eenv->dst_cpu;
+	sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+	if (!sd)
+		return 0; /* Error */
+
+	sg = sd->groups;
+
+	do {
+		if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+			eenv_before.sg_top = eenv->sg_top = sg;
+
+			if (sched_group_energy(&eenv_before))
+				return 0; /* Invalid result abort */
+			energy_before += eenv_before.energy;
+
+			if (sched_group_energy(eenv))
+				return 0; /* Invalid result abort */
+			energy_after += eenv->energy;
+		}
+	} while (sg = sg->next, sg != sd->groups);
+
+	return energy_after-energy_before;
+}
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  * A waker of many should wake a different task than the one last awakened

From 1b5ec5d8ab918a62446f91ddfdb3eba88ea8c8dd Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Sat, 9 May 2015 16:49:57 +0100
Subject: [PATCH 0073/3220] sched: Add over-utilization/tipping point indicator

Energy-aware scheduling is only meant to be active while the system is
_not_ over-utilized. That is, there are spare cycles available to shift
tasks around based on their actual utilization to get a more
energy-efficient task distribution without depriving any tasks. When
above the tipping point task placement is done the traditional way based
on load_avg, spreading the tasks across as many cpus as possible based
on priority scaled load to preserve smp_nice. Below the tipping point we
want to use util_avg instead. We need to define a criterion for when we
make the switch.

The util_avg for each cpu converges towards 100% (1024) regardless of
how many additional tasks we may put on it. If we define over-utilized
as:

	sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity)

some individual cpus may be over-utilized running multiple tasks even
when the above condition is false. That should be okay as long as we try
to spread the tasks out to avoid per-cpu over-utilization as much as
possible and if all tasks have the _same_ priority. If the latter isn't
true, we have to consider priority to preserve smp_nice.

For example, we could have n_cpus nice=-10 util_avg=55% tasks and
n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg we are
likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks
getting their own as we have 1.5*n_cpus tasks in total and 55%+55% is
less over-utilized than 55%+60% for those cpus that have to be shared.
The system utilization is only 85% of the system capacity, but we are
breaking smp_nice.

To be sure not to break smp_nice, we have defined over-utilization
conservatively as when any cpu in the system is fully utilized at its
highest frequency instead:

	cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity

IOW, as soon as one cpu is (nearly) 100% utilized, we switch to load_avg
to factor in priority to preserve smp_nice.

With this definition, we can skip periodic load-balance as no cpu has an
always-running task when the system is not over-utilized. All tasks will
be periodic and we can balance them at wake-up. This conservative
condition does however mean that some scenarios that could benefit from
energy-aware decisions even if one cpu is fully utilized would not get
those benefits.

For systems where some cpus might have reduced capacity (RT-pressure
and/or big.LITTLE), we want periodic load-balance checks as soon as just
a single cpu is fully utilized as it might be one of those with reduced
capacity and in that case we want to migrate it.
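The diff below declares cpu_overutilized() but its definition is not part
of this excerpt. A minimal sketch of the per-cpu test described above,
assuming a capacity_margin of 1280 (util is compared against ~80% of
capacity; the constant and the helper body are assumptions here):

static unsigned int capacity_margin = 1280; /* ~20% headroom, assumed */

static bool cpu_overutilized(int cpu)
{
	/* (nearly) fully utilized at the highest OPP? */
	return (capacity_of(cpu) * 1024) <
	       (cpu_util(cpu) * capacity_margin);
}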
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 31 +++++++++++++++++++++++++------ kernel/sched/sched.h | 3 +++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 043715612227..3cfb4be3d88f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static bool cpu_overutilized(int cpu); + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4154,6 +4156,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_new = !(flags & ENQUEUE_WAKEUP); for_each_sched_entity(se) { if (se->on_rq) @@ -4185,9 +4188,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { add_nr_running(rq, 1); - + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) + rq->rd->overutilized = true; + } hrtick_update(rq); } @@ -6651,11 +6657,12 @@ group_type group_classify(struct sched_group *group, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; int i; @@ -6685,6 +6692,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) + *overutilized = true; } /* Adjust by relative CPU capacity of the group */ @@ -6790,7 +6800,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6812,7 +6822,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6856,8 +6866,14 @@ next_group: /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; - } + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) + env->dst_rq->rd->overutilized = overutilized; + } else { + if (!env->dst_rq->rd->overutilized && overutilized) + env->dst_rq->rd->overutilized = true; + } } /** @@ -8250,6 +8266,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + rq->rd->overutilized = true; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a618db2936d4..5323d4fc1109 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -528,6 +528,9 @@ struct root_domain { /* Indicate more than one runnable task for any CPU */ bool overload; + /* Indicate one or more cpus over-utilized 
(tipping point) */ + bool overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). From 19a5ebe13dded57ab2b04defd2e379579da05197 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 27 Jan 2015 13:48:07 +0000 Subject: [PATCH 0074/3220] sched, cpuidle: Track cpuidle state index in the scheduler The idle-state of each cpu is currently pointed to by rq->idle_state but there isn't any information in the struct cpuidle_state that can used to look up the idle-state energy model data stored in struct sched_group_energy. For this purpose is necessary to store the idle state index as well. Ideally, the idle-state data should be unified. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- drivers/cpuidle/cpuidle.c | 4 ++-- include/linux/cpuidle.h | 2 +- kernel/sched/idle.c | 3 ++- kernel/sched/sched.h | 21 +++++++++++++++++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 17a6dc0e2111..b9a6cceb7bac 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -192,7 +192,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, } /* Take note of the planned idle state. */ - sched_idle_set_state(target_state); + sched_idle_set_state(target_state, index); trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ktime_get(); @@ -205,7 +205,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ - sched_idle_set_state(NULL); + sched_idle_set_state(NULL, -1); if (broadcast) { if (WARN_ON_ONCE(!irqs_disabled())) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 786ad32631a6..6eae1576499e 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -204,7 +204,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, #endif /* kernel/sched/idle.c */ -extern void sched_idle_set_state(struct cpuidle_state *idle_state); +extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index); extern void default_idle_call(void); #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..cbc130efbc5b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -19,9 +19,10 @@ * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. 
*/ -void sched_idle_set_state(struct cpuidle_state *idle_state) +void sched_idle_set_state(struct cpuidle_state *idle_state, int index) { idle_set_state(this_rq(), idle_state); + idle_set_state_idx(this_rq(), index); } static int __read_mostly cpu_idle_force_poll; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5323d4fc1109..2301b0476b90 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -693,6 +693,7 @@ struct rq { #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; + int idle_state_idx; #endif }; @@ -1285,6 +1286,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ + rq->idle_state_idx = idle_state_idx; +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state_idx; +} #else static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) @@ -1295,6 +1307,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + return -1; +} #endif extern void sysrq_sched_debug_show(void); From ec055b978e851118a2b9d42f61ecad619cd1ce77 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 27 Jan 2015 14:04:17 +0000 Subject: [PATCH 0075/3220] sched: Determine the current sched_group idle-state To estimate the energy consumption of a sched_group in sched_group_energy() it is necessary to know which idle-state the group is in when it is idle. For now, it is assumed that this is the current idle-state (though it might be wrong). Based on the individual cpu idle-states group_idle_state() finds the group idle-state. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3cfb4be3d88f..c11bc7392916 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4815,6 +4815,20 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } +static int group_idle_state(struct sched_group *sg) +{ + int i, state = INT_MAX; + + /* Find the shallowest idle state in the sched group. 
*/
+	for_each_cpu(i, sched_group_cpus(sg))
+		state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+	/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+	state++;
+
+	return state;
+}
+
 /*
  * sched_group_energy(): Computes the absolute energy consumption of cpus
  * belonging to the sched_group including shared resources shared only by
@@ -4867,7 +4881,8 @@ static int sched_group_energy(struct energy_env *eenv)
 
 		do {
 			unsigned long group_util;
-			int sg_busy_energy, sg_idle_energy, cap_idx;
+			int sg_busy_energy, sg_idle_energy;
+			int cap_idx, idle_idx;
 
 			if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
 				eenv->sg_cap = sg_shared_cap;
@@ -4875,11 +4890,13 @@ static int sched_group_energy(struct energy_env *eenv)
 				eenv->sg_cap = sg;
 
 			cap_idx = find_new_capacity(eenv, sg->sge);
+			idle_idx = group_idle_state(sg);
 			group_util = group_norm_util(eenv, sg);
 			sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
-								>> SCHED_CAPACITY_SHIFT;
-			sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power)
-								>> SCHED_CAPACITY_SHIFT;
+						>> SCHED_CAPACITY_SHIFT;
+			sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
+						* sg->sge->idle_states[idle_idx].power)
+						>> SCHED_CAPACITY_SHIFT;
 
 			total_energy += sg_busy_energy + sg_idle_energy;

From e38982ba3289974385bd8fead9bf767d8cdde386 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Sat, 9 May 2015 20:03:19 +0100
Subject: [PATCH 0076/3220] sched: Energy-aware wake-up task placement

Let available compute capacity and estimated energy impact select the
wake-up target cpu when energy-aware scheduling is enabled and the
system is not over-utilized (i.e. below the tipping point).

energy_aware_wake_cpu() attempts to find a group of cpus with sufficient
compute capacity to accommodate the task and then a cpu with enough
spare capacity to handle the task within that group. Preference is given
to cpus with enough spare capacity at the current OPP. Finally, the
energy impact of the new target and the previous task cpu is compared to
select the wake-up target cpu.

cc: Ingo Molnar
cc: Peter Zijlstra
Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 89 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c11bc7392916..682b4ae9ebd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5287,6 +5287,86 @@ done:
 	return target;
 }
 
+static int energy_aware_wake_cpu(struct task_struct *p, int target)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg, *sg_target;
+	int target_max_cap = INT_MAX;
+	int target_cpu = task_cpu(p);
+	int i;
+
+	sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+
+	if (!sd)
+		return target;
+
+	sg = sd->groups;
+	sg_target = sg;
+
+	/*
+	 * Find group with sufficient capacity. We only get here if no cpu is
+	 * overutilized. We may end up overutilizing a cpu by adding the task,
+	 * but that should not be any worse than select_idle_sibling().
+	 * load_balance() should sort it out later as we get above the tipping
+	 * point.
+	 */
+	do {
+		/* Assuming all cpus are the same in group */
+		int max_cap_cpu = group_first_cpu(sg);
+
+		/*
+		 * Assume smaller max capacity means more energy-efficient.
+		 * Ideally we should query the energy model for the right
+		 * answer but it easily ends up in an exhaustive search.
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + + if (target_cpu != task_cpu(p)) { + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = task_cpu(p), + .dst_cpu = target_cpu, + }; + + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(task_cpu(p))) + return target_cpu; + + if (energy_diff(&eenv) >= 0) + return task_cpu(p); + } + + return target_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5309,8 +5389,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + energy_aware(); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5340,7 +5421,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) + new_cpu = energy_aware_wake_cpu(p, prev_cpu); + else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); } else while (sd) { From 52b7b8a37c3da55885a9b7e60b2c4db6cf2dbeff Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 10 May 2015 15:17:32 +0100 Subject: [PATCH 0077/3220] sched: Consider a not over-utilized energy-aware system as balanced In case the system operates below the tipping point indicator, introduced in ("sched: Add over-utilization/tipping point indicator"), bail out in find_busiest_group after the dst and src group statistics have been checked. There is simply no need to move usage around because all involved cpus still have spare cycles available. For an energy-aware system below its tipping point, we rely on the task placement of the wakeup path. This works well for short running tasks. The existence of long running tasks on one of the involved cpus lets the system operate over its tipping point. To be able to move such a task (whose load can't be used to average the load among the cpus) from a src cpu with lower capacity than the dst_cpu, an additional rule has to be implemented in need_active_balance. Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 682b4ae9ebd7..ae4f1ee9cf0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7194,6 +7194,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. 
*/ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; From b71188bd2c52b6af022d687347eb4c7c29901580 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 3 Feb 2015 13:54:11 +0000 Subject: [PATCH 0078/3220] sched: Disable energy-unfriendly nohz kicks With energy-aware scheduling enabled nohz_kick_needed() generates many nohz idle-balance kicks which lead to nothing when multiple tasks get packed on a single cpu to save energy. This causes unnecessary wake-ups and hence wastes energy. Make these conditions depend on !energy_aware() for now until the energy-aware nohz story gets sorted out. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae4f1ee9cf0c..7327221c991e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8259,12 +8259,13 @@ static inline bool nohz_kick_needed(struct rq *rq) if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { + if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); From 1b35232614f555cbeed30a357ecf77a37632c347 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Mon, 29 Jun 2015 17:56:20 +0100 Subject: [PATCH 0079/3220] Documentation: DT bindings for energy model cost data required by EAS EAS (energy aware scheduling) provides the scheduler with an alternative objective - energy efficiency - as opposed to it's current performance oriented objectives. EAS relies on a simple platform energy cost model to guide scheduling decisions. The model only considers the CPU subsystem. This patch adds documentation describing DT bindings that should be used to supply the scheduler with an energy cost model. Signed-off-by: Robin Randhawa --- .../bindings/scheduler/sched-energy-costs.txt | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt new file mode 100644 index 000000000000..11216f09e596 --- /dev/null +++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt @@ -0,0 +1,360 @@ +=========================================================== +Energy cost bindings for Energy Aware Scheduling +=========================================================== + +=========================================================== +1 - Introduction +=========================================================== + +This note specifies bindings required for energy-aware scheduling +(EAS)[1]. Historically, the scheduler's primary objective has been +performance. EAS aims to provide an alternative objective - energy +efficiency. EAS relies on a simple platform energy cost model to +guide scheduling decisions. The model only considers the CPU +subsystem. + +This note is aligned with the definition of the layout of physical +CPUs in the system as described in the ARM topology binding +description [2]. 
The concept is applicable to any system so long as
+the cost model data is provided for those processing elements in
+that system's topology that EAS is required to service.
+
+Processing elements refer to hardware threads, CPUs and clusters of
+related CPUs in increasing order of hierarchy.
+
+EAS requires two key cost metrics - busy costs and idle costs. Busy
+costs comprise a list of compute capacities for the processing
+element in question and the corresponding power consumption at that
+capacity. Idle costs comprise a list of power consumption values
+for each idle state [C-state] that the processing element supports.
+For a detailed description of these metrics, their derivation and
+their use see [3].
+
+These cost metrics are required for processing elements in all
+scheduling domain levels that EAS is required to service.
+
+===========================================================
+2 - energy-costs node
+===========================================================
+
+Energy costs for the processing elements in scheduling domains that
+EAS is required to service are defined in the energy-costs node
+which acts as a container for the actual per processing element cost
+nodes. A single energy-costs node is required for a given system.
+
+- energy-costs node
+
+	Usage: Required
+
+	Description: The energy-costs node is a container node and
+	its sub-nodes describe costs for each processing element at
+	all scheduling domain levels that EAS is required to
+	service.
+
+	Node name must be "energy-costs".
+
+	The energy-costs node's parent node must be the cpus node.
+
+	The energy-costs node's child nodes can be:
+
+	- one or more cost nodes.
+
+	Any other configuration is considered invalid.
+
+The energy-costs node can only contain a single type of child node
+whose bindings are described in paragraph 4.
+
+===========================================================
+3 - energy-costs node child nodes naming convention
+===========================================================
+
+energy-costs child nodes must follow a naming convention where the
+node name must be "thread-costN", "core-costN", "cluster-costN"
+depending on whether the costs in the node are for a thread, core or
+cluster. N (where N = {0, 1, ...}) is the node number and has no
+bearing on the OS' logical thread, core or cluster index.
+
+===========================================================
+4 - cost node bindings
+===========================================================
+
+Bindings for cost nodes are defined as follows:
+
+- cluster-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain multiple clusters and each cluster
+	serviced by EAS must have a corresponding cluster-cost
+	node.
+
+	The cluster-cost node name must be "cluster-costN" as
+	described in 3 above.
+
+	A cluster-cost node must be a leaf node with no children.
+
+	Properties for cluster-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+- core-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain multiple cores and each core serviced by
+	EAS must have a corresponding core-cost node.
+
+	The core-cost node name must be "core-costN" as described in
+	3 above.
+
+	A core-cost node must be a leaf node with no children.
+
+	Properties for core-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+- thread-cost node
+
+	Description: must be declared within an energy-costs node. A
+	system can contain cores with multiple hardware threads and
+	each thread serviced by EAS must have a corresponding
+	thread-cost node.
+
+	The thread-cost node name must be "thread-costN" as described
+	in 3 above.
+
+	A thread-cost node must be a leaf node with no children.
+
+	Properties for thread-cost nodes are described in paragraph
+	5 below.
+
+	Any other configuration is considered invalid.
+
+===========================================================
+5 - Cost node properties
+===========================================================
+
+All cost node types must have only the following properties:
+
+- busy-cost-data
+
+	Usage: required
+	Value type: An array of 2-item tuples. Each item is of type
+	u32.
+	Definition: The first item in the tuple is the capacity
+	value as described in [3]. The second item in the tuple is
+	the energy cost value as described in [3].
+
+- idle-cost-data
+
+	Usage: required
+	Value type: An array of 1-item tuples. The item is of type
+	u32.
+	Definition: The item in the tuple is the energy cost value
+	as described in [3].
+
+===========================================================
+6 - Extensions to the cpu node
+===========================================================
+
+The cpu node is extended with a property that establishes the
+connection between the processing element represented by the cpu
+node and the cost-nodes associated with this processing element.
+
+The connection is expressed in line with the topological hierarchy
+that this processing element belongs to, starting with the level in
+the hierarchy that this processing element itself belongs to through
+to the highest level that EAS is required to service. The
+connection cannot be sparse and must be contiguous from the
+processing element's level through to the highest desired level. The
+highest desired level must be the same for all processing elements.
+
+Example: Given that a cpu node may represent a thread that is a part
+of a core, this property may contain multiple elements which
+associate the thread with cost nodes describing the costs for the
+thread itself, the core the thread belongs to, the cluster the core
+belongs to and so on. The elements must be ordered from the lowest
+level nodes to the highest desired level that EAS must service. The
+highest desired level must be the same for all cpu nodes. The
+elements must not be sparse: there must be elements for the current
+thread, the next level of hierarchy (core) and so on without any
+'holes'.
+
+Example: Given that a cpu node may represent a core that is a part
+of a cluster of related cpus, this property may contain multiple
+elements which associate the core with cost nodes describing the
+costs for the core itself, the cluster the core belongs to and so
+on. The elements must be ordered from the lowest level nodes to the
+highest desired level that EAS must service. The highest desired
+level must be the same for all cpu nodes. The elements must not be
+sparse: there must be elements for the current core, the next
+level of hierarchy (cluster) and so on without any 'holes'.
+
+If the system comprises hierarchical clusters of clusters, this
+property will contain multiple associations with the relevant number
+of cluster elements in hierarchical order.
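+
+As an illustration of the ordering rules above, a cpu node that
+represents one hardware thread of a multi-threaded core inside a
+cluster would carry three phandles, ordered thread, then core, then
+cluster. This is a hypothetical fragment for illustration only - the
+labels and cost node names are invented, and the property itself is
+formally defined next:
+
+	T0_0: cpu@0 {
+		...
+		sched-energy-costs = <&THREAD_COST_0 &CORE_COST_0
+				      &CLUSTER_COST_0>;
+	};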
+
+Property added to the cpu node:
+
+- sched-energy-costs
+
+	Usage: required
+	Value type: List of phandles
+	Definition: a list of phandles to specific cost nodes in the
+	energy-costs parent node that correspond to the processing
+	element represented by this cpu node in hierarchical order
+	of topology.
+
+	The order of phandles in the list is significant. The first
+	phandle is to the current processing element's own cost
+	node. Subsequent phandles are to higher hierarchical level
+	cost nodes up until the maximum level that EAS is to
+	service.
+
+	All cpu nodes must have the same highest level cost node.
+
+	The phandle list must not be sparsely populated with handles
+	to non-contiguous hierarchical levels. See commentary above
+	for clarity.
+
+	Any other configuration is invalid.
+
+===========================================================
+7 - Example dts
+===========================================================
+
+Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one
+cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus):
+
+cpus {
+	#address-cells = <2>;
+	#size-cells = <0>;
+	.
+	.
+	.
+	A57_0: cpu@0 {
+		compatible = "arm,cortex-a57","arm,armv8";
+		reg = <0x0 0x0>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A57_L2>;
+		clocks = <&scpi_dvfs 0>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+	};
+
+	A57_1: cpu@1 {
+		compatible = "arm,cortex-a57","arm,armv8";
+		reg = <0x0 0x1>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A57_L2>;
+		clocks = <&scpi_dvfs 0>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+	};
+
+	A53_0: cpu@100 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x100>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_1: cpu@101 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x101>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_2: cpu@102 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x102>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	A53_3: cpu@103 {
+		compatible = "arm,cortex-a53","arm,armv8";
+		reg = <0x0 0x103>;
+		device_type = "cpu";
+		enable-method = "psci";
+		next-level-cache = <&A53_L2>;
+		clocks = <&scpi_dvfs 1>;
+		cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+		sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+	};
+
+	energy-costs {
+		CPU_COST_0: core-cost0 {
+			busy-cost-data = <
+				417   168
+				579   251
+				744   359
+				883   479
+				1024  616
+			>;
+			idle-cost-data = <
+				15
+				0
+			>;
+		};
+		CPU_COST_1: core-cost1 {
+			busy-cost-data = <
+				235 33
+				302 46
+				368 61
+				406 76
+				447 93
+			>;
+			idle-cost-data = <
+				6
+				0
+			>;
+		};
+		CLUSTER_COST_0: cluster-cost0 {
+			busy-cost-data = <
+				417   24
+				579   32
+				744   43
+				883   49
+				1024  64
+			>;
+			idle-cost-data = <
+				65
+				24
+			>;
+		};
+		CLUSTER_COST_1: cluster-cost1 {
+			busy-cost-data = <
+				235 26
+				303 30
+				368 39
+				406 47
+				447 57
+			>;
+			idle-cost-data = <
+				56
+				17
+			>;
+		};
+	};
+};
+
+===============================================================================
+[1] https://lkml.org/lkml/2015/5/12/728
+[2] Documentation/devicetree/bindings/topology.txt
+[3] Documentation/scheduler/sched-energy.txt

From d2b3db032e34938c970b148e71cc401fc4c541e8 Mon Sep 17 00:00:00 2001
From: Robin Randhawa
Date: Mon, 29 Jun 2015 18:01:58 +0100
Subject: [PATCH 0080/3220] sched: Support for extracting EAS energy costs from
 DT

This patch implements support for extracting energy cost data from DT.
The data should conform to the DT bindings for energy cost data needed
by EAS (energy aware scheduling).

Signed-off-by: Robin Randhawa
---
 include/linux/sched_energy.h |  36 ++++++++++
 kernel/sched/Makefile        |   2 +-
 kernel/sched/energy.c        | 124 +++++++++++++++++++++++++++++++++++
 3 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched_energy.h
 create mode 100644 kernel/sched/energy.c

diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
new file mode 100644
index 000000000000..a3f1627ac609
--- /dev/null
+++ b/include/linux/sched_energy.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_SCHED_ENERGY_H
+#define _LINUX_SCHED_ENERGY_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * There doesn't seem to be an NR_CPUS style max number of sched domain
+ * levels so here's an arbitrary constant one for the moment.
+ *
+ * The levels alluded to here correspond to entries in struct
+ * sched_domain_topology_level that are meant to be populated by arch
+ * specific code (topology.c).
+ */
+#define NR_SD_LEVELS 8
+
+#define SD_LEVEL0 0
+#define SD_LEVEL1 1
+#define SD_LEVEL2 2
+#define SD_LEVEL3 3
+#define SD_LEVEL4 4
+#define SD_LEVEL5 5
+#define SD_LEVEL6 6
+#define SD_LEVEL7 7
+
+/*
+ * Convenience macro for iterating through said sd levels.
+ */
+#define for_each_possible_sd_level(level)	    \
+	for (level = 0; level < NR_SD_LEVELS; level++)
+
+extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+void init_sched_energy_costs(void);
+
+#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..a541b5ce1dcc 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,7 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o
 obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100644
index 000000000000..b0656b7a93e3
--- /dev/null
+++ b/kernel/sched/energy.c
@@ -0,0 +1,124 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+static void free_resources(void)
+{
+	int cpu, sd_level;
+	struct sched_group_energy *sge;
+
+	for_each_possible_cpu(cpu) {
+		for_each_possible_sd_level(sd_level) {
+			sge = sge_array[cpu][sd_level];
+			if (sge) {
+				kfree(sge->cap_states);
+				kfree(sge->idle_states);
+				kfree(sge);
+			}
+		}
+	}
+}
+
+void init_sched_energy_costs(void)
+{
+	struct device_node *cn, *cp;
+	struct capacity_state *cap_states;
+	struct idle_state *idle_states;
+	struct sched_group_energy *sge;
+	const struct property *prop;
+	int sd_level, i, nstates, cpu;
+	const __be32 *val;
+
+	for_each_possible_cpu(cpu) {
+		cn = of_get_cpu_node(cpu, NULL);
+		if (!cn) {
+			pr_warn("CPU device node missing for CPU %d\n", cpu);
+			return;
+		}
+
+		if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+			pr_warn("CPU device node has no sched-energy-costs\n");
+			return;
+		}
+
+		for_each_possible_sd_level(sd_level) {
+			cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+			if (!cp)
+				break;
+
+			prop = of_find_property(cp, "busy-cost-data", NULL);
+			if (!prop || !prop->value) {
+				pr_warn("No busy-cost data, skipping sched_energy init\n");
+				goto out;
+			}
+
+			sge = kcalloc(1, sizeof(struct sched_group_energy),
+				      GFP_NOWAIT);
+
+			nstates = (prop->length / sizeof(u32)) / 2;
+			cap_states = kcalloc(nstates,
+					     sizeof(struct capacity_state),
+					     GFP_NOWAIT);
+
+			for (i = 0, val = prop->value; i < nstates; i++) {
+				cap_states[i].cap = be32_to_cpup(val++);
+				cap_states[i].power = be32_to_cpup(val++);
+			}
+
+			sge->nr_cap_states = nstates;
+			sge->cap_states = cap_states;
+
+			prop = of_find_property(cp, "idle-cost-data", NULL);
+			if (!prop || !prop->value) {
+				pr_warn("No idle-cost data, skipping sched_energy init\n");
+				goto out;
+			}
+
+			nstates = (prop->length / sizeof(u32));
+			idle_states = kcalloc(nstates,
+					      sizeof(struct idle_state),
+					      GFP_NOWAIT);
+
+			for (i = 0, val = prop->value; i < nstates; i++)
+				idle_states[i].power = be32_to_cpup(val++);
+
+			sge->nr_idle_states = nstates;
+			sge->idle_states = idle_states;
+
+			sge_array[cpu][sd_level] = sge;
+		}
+	}
+
+	pr_info("Sched-energy-costs installed from DT\n");
+	return;
+
+out:
+	free_resources();
+}

From 8296e78e5eaa9bfebf251cbd4bddd97322fa13a9 Mon Sep 17 00:00:00 2001
From: Robin Randhawa
Date: Tue, 9 Jun 2015 15:10:00 +0100
Subject: [PATCH 0081/3220] arm64, topology: Updates to use DT bindings for EAS
 costing data

With the bindings and the associated accessors to extract data from the
bindings in place, remove the static hard-coded data from topology.c and
use the accessors instead.
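
For context: the arm64_topology table passed to set_sched_topology()
below is not visible in the quoted hunks. As a rough editorial sketch of
what such a table looks like once wired to the accessors this patch adds
(the mask/flags helpers and the SD_INIT_NAME usage here are assumptions,
not quoted from the series):

static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
	/* SD_LEVEL0 cost data is consumed through cpu_core_energy() */
	{ cpu_coregroup_mask, cpu_core_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
	/* SD_LEVEL1 cost data is consumed through cpu_cluster_energy() */
	{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
	{ NULL, },
};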
Signed-off-by: Robin Randhawa --- arch/arm64/kernel/topology.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fb99a6735fd4..b5b43af6a7dc 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -218,6 +220,33 @@ out: struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu); + return NULL; + } + + return sge; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for CPU%d\n", cpu); + return NULL; + } + + return sge; +} + const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_sibling; @@ -344,4 +373,8 @@ void __init init_cpu_topology(void) */ if (of_have_populated_dt() && parse_dt_topology()) reset_cpu_topology(); + else + set_sched_topology(arm64_topology); + + init_sched_energy_costs(); } From b03f1ba3d5bb9db0ca495c8c33ef88588daaae50 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 Sep 2015 16:47:48 +0100 Subject: [PATCH 0082/3220] cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support Implements cpufreq_scale_max_freq_capacity() to provide the scheduler with a maximum frequency scaling correction factor for more accurate load-tracking and cpu capacity handling by being able to deal with frequency capping. This scaling factor describes the influence of running a cpu with a current maximum frequency lower than the absolute possible maximum frequency on load tracking and cpu capacity. The factor is: current_max_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, max_freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for max_freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 19 +++++++++++++++++++ include/linux/cpufreq.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 46a7d62c8174..96f1a8860819 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -352,12 +352,14 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) *********************************************************************/ static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE; static void scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { unsigned long cur = freqs ? 
freqs->new : policy->cur; unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo; int cpu; pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", @@ -365,6 +367,18 @@ scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) for_each_cpu(cpu, policy->cpus) per_cpu(freq_scale, cpu) = scale; + + if (freqs) + return; + + scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq; + + pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n", + cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq, + scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(max_freq_scale, cpu) = scale; } unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) @@ -372,6 +386,11 @@ unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) return per_cpu(freq_scale, cpu); } +unsigned long cpufreq_scale_max_freq_capacity(int cpu) +{ + return per_cpu(max_freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 11c65f536b6c..9b1d61e1f1d7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -619,4 +619,5 @@ int cpufreq_generic_init(struct cpufreq_policy *policy, struct sched_domain; unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif /* _LINUX_CPUFREQ_H */ From 05a773bfc98b786cf504af1bce8bfe4a84a6a586 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 26 Sep 2015 18:19:54 +0100 Subject: [PATCH 0083/3220] sched: Update max cpu capacity in case of max frequency constraints Wakeup balancing uses cpu capacity awareness and needs to know the system-wide maximum cpu capacity. Patch "sched: Store system-wide maximum cpu capacity in root domain" finds the system-wide maximum cpu capacity during scheduler domain hierarchy setup. This is sufficient as long as maximum frequency invariance is not enabled. If it is enabled, the system-wide maximum cpu capacity can change between scheduler domain hierarchy setups due to frequency capping. The cpu capacity is changed in update_cpu_capacity() which is called in load balance on the lowest scheduler domain hierarchy level. To be able to know if a change in cpu capacity for a certain cpu also has an effect on the system-wide maximum cpu capacity it is normally necessary to iterate over all cpus. This would be way too costly. That's why this patch follows a different approach. The unsigned long max_cpu_capacity value in struct root_domain is replaced with a struct max_cpu_capacity, containing value (the max_cpu_capacity) and cpu (the cpu index of the cpu providing the maximum cpu_capacity). Changes to the system-wide maximum cpu capacity and the cpu index are made if: 1 System-wide maximum cpu capacity < cpu capacity 2 System-wide maximum cpu capacity > cpu capacity and cpu index == cpu There are no changes to the system-wide maximum cpu capacity in all other cases. Atomic read and write access to the pair (max_cpu_capacity.val, max_cpu_capacity.cpu) is enforced by max_cpu_capacity.lock. The access to max_cpu_capacity.val in task_fits_max() is still performed without taking the max_cpu_capacity.lock. The code to set max cpu capacity in build_sched_domains() has been removed because the whole functionality is now provided by update_cpu_capacity() instead. 
This approach can introduce errors temporarily, e.g. in case the cpu currently providing the max cpu capacity has its cpu capacity lowered due to frequency capping and calls update_cpu_capacity() before any cpu which might provide the max cpu now. There is also an outstanding question: Should the cpu capacity of a cpu going idle be set to a very small value? Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++------ kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- kernel/sched/sched.h | 10 +++++++++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc5bb26ccf3b..3680e76b6beb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5888,6 +5888,8 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; + + init_max_cpu_capacity(&rd->max_cpu_capacity); return 0; free_rto_mask: @@ -7105,15 +7107,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); - - if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) - rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); - if (rq) - pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); - ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7327221c991e..91dfc8aa8aff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5078,7 +5078,7 @@ static inline bool __task_fits(struct task_struct *p, int cpu, int util) static inline bool task_fits_max(struct task_struct *p, int cpu) { unsigned long capacity = capacity_of(cpu); - unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; if (capacity == max_capacity) return true; @@ -6564,13 +6564,43 @@ static unsigned long scale_rt_capacity(int cpu) return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2301b0476b90..aac581932eff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -510,6 +510,12 @@ struct dl_rq { #ifdef CONFIG_SMP +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. 
Each exclusive cpuset essentially defines an island domain by
@@ -548,7 +554,7 @@ struct root_domain {
 	struct cpupri cpupri;
 
 	/* Maximum cpu capacity in the system. */
-	unsigned long max_cpu_capacity;
+	struct max_cpu_capacity max_cpu_capacity;
 };
 
 extern struct root_domain def_root_domain;
@@ -1340,6 +1346,8 @@ unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
 
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
+
 static inline void add_nr_running(struct rq *rq, unsigned count)
 {
 	unsigned prev_nr = rq->nr_running;

From 5414a72fc30f2069e3943ceb4e39b0ba9cd9c7e3 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Wed, 23 Sep 2015 17:59:55 +0100
Subject: [PATCH 0084/3220] arm: Enable max freq invariant scheduler
 load-tracking and cpu capacity support

Maximum Frequency Invariance has to be part of Cpu Invariance because
Frequency Invariance deals only with differences in load-tracking
introduced by Dynamic Frequency Scaling and not with limiting the
possible range of cpu frequency.

By placing Maximum Frequency Invariance into Cpu Invariance,
load-tracking is scaled via arch_scale_cpu_capacity() in
__update_load_avg() and cpu capacity is scaled via
arch_scale_cpu_capacity() in update_cpu_capacity().

To be able to save the extra multiplication in the scheduler hotpath
(__update_load_avg()) we could:

1 Inform cpufreq about base cpu capacity at boot and let it handle
  scale_cpu_capacity() as well.

2 Use the cpufreq policy callback which would update a per-cpu current
  cpu_scale and this value would be returned by scale_cpu_capacity().

3 Use per-cpu current max_freq_scale and current cpu_scale with the
  current patch.

Signed-off-by: Dietmar Eggemann
---
 arch/arm/kernel/topology.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index da1c611a3b5e..0308342def8c 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,7 +44,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
 unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
+#ifdef CONFIG_CPU_FREQ
+	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
 	return per_cpu(cpu_scale, cpu);
+#endif
 }
 
 static void set_capacity_scale(unsigned int cpu, unsigned long capacity)

From b732dd602c61bab535b95237552e530ca8de8b93 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 25 Sep 2015 17:34:15 +0100
Subject: [PATCH 0085/3220] arm64: Enable max freq invariant scheduler
 load-tracking and capacity support

Maximum Frequency Invariance has to be part of Cpu Invariance because
Frequency Invariance deals only with differences in load-tracking
introduced by Dynamic Frequency Scaling and not with limiting the
possible range of cpu frequency.

By placing Maximum Frequency Invariance into Cpu Invariance,
load-tracking is scaled via arch_scale_cpu_capacity() in
__update_load_avg() and cpu capacity is scaled via
arch_scale_cpu_capacity() in update_cpu_capacity().

To be able to save the extra multiplication in the scheduler hotpath
(__update_load_avg()) we could:

1 Inform cpufreq about base cpu capacity at boot and let it handle
  scale_cpu_capacity() as well.

2 Use the cpufreq policy callback which would update a per-cpu current
  cpu_scale and this value would be returned by scale_cpu_capacity().

3 Use per-cpu current max_freq_scale and current cpu_scale with the
  current patch.
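
As a worked editorial example of the max frequency invariance
arithmetic used by both of these patches (the frequencies and the
resulting values are made up; this is a standalone userspace sketch,
not kernel code):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	unsigned long cpuinfo_max_freq = 2000000;	/* 2.0 GHz, in kHz */
	unsigned long policy_max = 1500000;		/* capped to 1.5 GHz */
	unsigned long cpu_scale = SCHED_CAPACITY_SCALE;	/* biggest cpu type */

	/* max_freq_scale, as computed by scale_freq_capacity() */
	unsigned long max_freq_scale =
		(policy_max << SCHED_CAPACITY_SHIFT) / cpuinfo_max_freq;

	/* capacity seen by the scheduler, as in scale_cpu_capacity() */
	unsigned long capacity =
		cpu_scale * max_freq_scale >> SCHED_CAPACITY_SHIFT;

	/* prints "768 768": a 25% frequency cap costs 25% of capacity */
	printf("%lu %lu\n", max_freq_scale, capacity);
	return 0;
}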
Including <linux/cpufreq.h> in topology.h like for the arm arch doesn't
work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's
why cpufreq_scale_max_freq_capacity() has to be declared extern in
topology.h.

Signed-off-by: Dietmar Eggemann
---
 arch/arm64/include/asm/topology.h | 1 +
 arch/arm64/kernel/topology.c      | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 9370a336c934..bbd362cd1ed1 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -26,6 +26,7 @@ struct sched_domain;
 #ifdef CONFIG_CPU_FREQ
 #define arch_scale_freq_capacity cpufreq_scale_freq_capacity
 extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+extern unsigned long cpufreq_scale_max_freq_capacity(int cpu);
 #endif
 #define arch_scale_cpu_capacity scale_cpu_capacity
 extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index b5b43af6a7dc..5b2c67a510d8 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -29,7 +29,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
 
 unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
+#ifdef CONFIG_CPU_FREQ
+	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
 	return per_cpu(cpu_scale, cpu);
+#endif
 }
 
 static void set_capacity_scale(unsigned int cpu, unsigned long capacity)

From e14f1518239461ec91e3cb99fba6809c7d5fa4ad Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Wed, 13 Jan 2016 15:49:44 +0000
Subject: [PATCH 0086/3220] sched: Do eas idle balance regardless of the rq avg
 idle value

EAS relies on idle balance to migrate a misfit task towards a cpu with
higher capacity. When such a cpu becomes idle, idle balance should
happen even if the rq avg idle is smaller than the sched migration cost
(default 500us).

The rq avg idle is updated during the wakeup of a task in case the rq
has a non-null idle_stamp. This value stays unchanged and valid until
the next task wakes up on this cpu after an idle period. So rq avg idle
could be smaller than the sched migration cost, preventing the idle
balance from happening. In this case we would be at the mercy of wakeup,
periodic or nohz-idle load balancing to put another task on this cpu.

To break this dependency on rq avg idle, make the EAS idle balance
independent of the requirement that rq avg idle be larger than the
sched migration cost.
Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 91dfc8aa8aff..7d58b1fea71d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7772,8 +7772,9 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) From 563ddb604e01393625aa56601188b7eeb20888da Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:43:49 +0000 Subject: [PATCH 0087/3220] sched: Add per-cpu max capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all cpus in the sched_group. Unless it is divided by the group_weight to get the average capacity per cpu it hides differences in cpu capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers cpus of different capacities. Instead, by extending struct sched_group_capacity to indicate max per-cpu capacity in the group a suitable group for a given task utilization can easily be found such that cpus with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 3 ++- kernel/sched/fair.c | 17 ++++++++++++----- kernel/sched/sched.h | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3680e76b6beb..23e82805ec47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5714,7 +5714,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6193,6 +6193,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. 
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d58b1fea71d..9762288ee47b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6609,13 +6609,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6628,6 +6629,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6652,11 +6654,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); } } else { /* @@ -6666,12 +6669,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aac581932eff..967d70d49af9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -858,7 +858,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long max_capacity; /* Max per-cpu capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From f2a8923298fe9a39cf4c9ad48bbcb0781483d1f9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:47:54 +0000 Subject: [PATCH 0088/3220] sched: Add group_misfit_task load-balance type To maximize throughput in systems with reduced capacity cpus (e.g. high RT/IRQ load and/or ARM big.LITTLE) load-balancing has to consider task and cpu utilization as well as per-cpu compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks that are scheduled on a reduced capacity cpu need to be identified and migrated to a higher capacity cpu if possible. To implement this additional policy an additional group_type (load-balance scenario) is added: group_misfit_task. This represents scenarios where a sched_group has tasks that are not suitable for its per-cpu capacity. group_misfit_task is only considered if the system is not overloaded in any other way (group_imbalanced or group_overloaded). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. 
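
One detail worth noting before the diff: the position of
group_misfit_task inside enum group_type is load-bearing, because the
load-balance code compares group_type values numerically. A minimal
standalone illustration of that ordering rule (editorial sketch, not
kernel code):

#include <assert.h>

/* Same ordering as the patch: severity increases with the value. */
enum group_type {
	group_other = 0,
	group_misfit_task,
	group_imbalanced,
	group_overloaded,
};

/*
 * update_sd_pick_busiest() keeps the numerically larger
 * classification, so a misfit group only wins when no group is
 * imbalanced or overloaded - the "only considered if the system is
 * not overloaded in any other way" rule from the text above.
 */
static enum group_type pick(enum group_type busiest, enum group_type sg)
{
	return sg > busiest ? sg : busiest;
}

int main(void)
{
	assert(pick(group_other, group_misfit_task) == group_misfit_task);
	assert(pick(group_overloaded, group_misfit_task) == group_overloaded);
	return 0;
}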
Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 29 ++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9762288ee47b..b16e1b7878f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5739,6 +5739,8 @@ again: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5760,9 +5762,12 @@ simple: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding @@ -5975,6 +5980,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6446,12 +6458,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -6467,6 +6473,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6783,6 +6790,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } @@ -6830,8 +6840,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) + if (cpu_overutilized(i)) { *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -8412,6 +8425,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; + + rq->misfit_task = !task_fits_max(curr, rq->cpu); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 967d70d49af9..fcbb3e07accc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -584,6 +584,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; + unsigned int misfit_task; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; From 0d2b1cdfa11fd62d3358eef332e78b95b6a3e9ec Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:51:35 +0000 Subject: [PATCH 0089/3220] sched: Consider misfit tasks when load-balancing With the new group_misfit_task load-balancing scenario additional policy conditions are needed when load-balancing. Misfit task balancing only makes sense between source group with lower capacity than the target group. If capacities are the same, fallback to normal group_other balancing. The aim is to balance tasks such that no task has its throughput hindered by compute capacity if a cpu with more capacity is available. 
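
The capacity comparison used for this is margin-based rather than
exact: a group only counts as having smaller per-cpu capacity if it
trails the reference group by more than capacity_margin -
SCHED_LOAD_SCALE (1280 - 1024 = 256) capacity units. A worked editorial
example with illustrative big.LITTLE capacity values (not taken from a
real platform; standalone sketch, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
static unsigned long capacity_margin = 1280;	/* ~20% margin */

/* Mirrors group_smaller_cpu_capacity() from the diff below. */
static bool smaller_cpu_capacity(unsigned long sg_max, unsigned long ref_max)
{
	return sg_max + capacity_margin - SCHED_LOAD_SCALE < ref_max;
}

int main(void)
{
	/* LITTLE (447) vs big (1024): 447 + 256 < 1024, so "smaller" */
	printf("%d\n", smaller_cpu_capacity(447, 1024));	/* 1 */
	/* near-equal groups: 900 + 256 >= 1024, so not "smaller" */
	printf("%d\n", smaller_cpu_capacity(900, 1024));	/* 0 */
	return 0;
}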
Load-balancing is generally based on average load in the sched_groups,
but for misfitting tasks it is necessary to introduce exceptions to
migrate tasks against usual metrics and optimize throughput.

This patch ensures the following load-balance for mixed capacity systems
(e.g. ARM big.LITTLE) for always-running tasks:

1. Place a task on each cpu starting in order from cpus with highest
capacity to lowest until all cpus are in use (i.e. one task on each
cpu).

2. Once all cpus are in use balance according to compute capacity such
that load per capacity is approximately the same regardless of the
compute capacity (i.e. big cpus get more tasks than little cpus).

Necessary changes are introduced in find_busiest_group(),
calculate_imbalance(), and find_busiest_queue(). This includes passing
the group_type on to find_busiest_queue() through struct lb_env, which
currently considers only the imbalance and not the imbalance situation
(group_type).

To avoid taking remote rq locks to examine source sched_groups for
misfit tasks, each cpu is responsible for tracking misfit tasks itself
and updating the rq->misfit_task flag. This means checking task
utilization when tasks are scheduled and on sched_tick.

Signed-off-by: Morten Rasmussen
---
 kernel/sched/fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b16e1b7878f6..5b8a9c68f3fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6016,6 +6016,7 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	enum group_type		busiest_group_type;
 	struct list_head	tasks;
 };
 
@@ -6780,6 +6781,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 	return false;
 }
 
+
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+	return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
+							ref->sgc->max_capacity;
+}
+
 static inline enum
 group_type group_classify(struct sched_group *group,
 			  struct sg_lb_stats *sgs)
@@ -6886,9 +6899,25 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (sgs->group_type < busiest->group_type)
 		return false;
 
+	/*
+	 * Candidate sg doesn't face any serious load-balance problems
+	 * so don't pick it if the local sg is already filled up.
+	 */
+	if (sgs->group_type == group_other &&
+	    !group_has_capacity(env, &sds->local_stat))
+		return false;
+
 	if (sgs->avg_load <= busiest->avg_load)
 		return false;
 
+	/*
+	 * Candidate sg has no more than one task per cpu and has higher
+	 * per-cpu capacity. No reason to pull tasks to less capable cpus.
+	 */
+	if (sgs->sum_nr_running <= sgs->group_weight &&
+	    group_smaller_cpu_capacity(sds->local, sg))
+		return false;
+
 	/* This is the busiest node in its class. */
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
@@ -6994,6 +7023,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 			sgs->group_type = group_classify(sg, sgs);
 		}
 
+		/*
+		 * Ignore task groups with misfit tasks if local group has no
+		 * capacity or if per-cpu capacity isn't higher.
+ */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -7170,6 +7208,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -7203,6 +7257,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -7276,6 +7335,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -7299,7 +7363,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -7312,6 +7377,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -7370,7 +7436,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* From c3b2e765af4af80ada13e5e34a26a28e3081dd0e Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:27 +0100 Subject: [PATCH 0090/3220] cpufreq: introduce cpufreq_driver_is_slow Some architectures and platforms perform CPU frequency transitions through a non-blocking method, while some might block or sleep. Even when frequency transitions do not block or sleep they may be very slow. This distinction is important when trying to change frequency from a non-interruptible context in a scheduler hot path. Describe this distinction with a cpufreq driver flag, CPUFREQ_DRIVER_FAST. The default is to not have this flag set, thus erring on the side of caution. cpufreq_driver_is_slow() is also introduced in this patch. 
Setting the above flag will allow this function to return false. [smuckle@linaro.org: change flag/API to include drivers that are too slow for scheduler hot paths, in addition to those that block/sleep] Cc: Rafael J. Wysocki Cc: Viresh Kumar Signed-off-by: Michael Turquette Signed-off-by: Steve Muckle --- drivers/cpufreq/cpufreq.c | 6 ++++++ include/linux/cpufreq.h | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 96f1a8860819..5ca8e9517aae 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -154,6 +154,12 @@ bool have_governor_per_policy(void) } EXPORT_SYMBOL_GPL(have_governor_per_policy); +bool cpufreq_driver_is_slow(void) +{ + return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST); +} +EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow); + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) { if (have_governor_per_policy()) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9b1d61e1f1d7..b5ef79aae048 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -160,6 +160,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); +bool cpufreq_driver_is_slow(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else static inline unsigned int cpufreq_get(unsigned int cpu) @@ -317,6 +318,14 @@ struct cpufreq_driver { */ #define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5) +/* + * Indicates that it is safe to call cpufreq_driver_target from + * non-interruptable context in scheduler hot paths. Drivers must + * opt-in to this flag, as the safe default is that they might sleep + * or be too slow for hot path use. + */ +#define CPUFREQ_DRIVER_FAST (1 << 6) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); From a967a45c71ae8978b2c0d06ee0de23f47737cfd5 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:48 +0100 Subject: [PATCH 0091/3220] sched: scheduler-driven cpu frequency selection Scheduler-driven CPU frequency selection hopes to exploit both per-task and global information in the scheduler to improve frequency selection policy, achieving lower power consumption, improved responsiveness/performance, and less reliance on heuristics and tunables. For further discussion on the motivation of this integration see [0]. This patch implements a shim layer between the Linux scheduler and the cpufreq subsystem. The interface accepts capacity requests from the CFS, RT and deadline sched classes. The requests from each sched class are summed on each CPU with a margin applied to the CFS and RT capacity requests to provide some headroom. Deadline requests are expected to be precise enough given their nature to not require headroom. The maximum total capacity request for a CPU in a frequency domain drives the requested frequency for that domain. Policy is determined by both the sched classes and this shim layer. Note that this algorithm is event-driven. There is no polling loop to check cpu idle time nor any other method which is unsynchronized with the scheduler, aside from a throttling mechanism to ensure frequency changes are not attempted faster than the hardware can accommodate them. 
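
For reference, a driver opts in to the fast path by setting the new flag
in its cpufreq_driver. The skeleton below is hypothetical - the
"example" names, frequency table and callbacks are invented for
illustration, and no driver in this series actually sets the flag:

#include <linux/cpufreq.h>

static struct cpufreq_frequency_table example_freq_table[] = {
	{ .frequency = 500000 },		/* made-up OPPs, in kHz */
	{ .frequency = 1000000 },
	{ .frequency = CPUFREQ_TABLE_END },
};

/* Pokes a clock register and never sleeps: hot-path safe. */
static int example_set_rate_fast(struct cpufreq_policy *policy,
				 unsigned int index)
{
	/* ... write example_freq_table[index].frequency to hardware ... */
	return 0;
}

static int example_cpufreq_init(struct cpufreq_policy *policy)
{
	return cpufreq_generic_init(policy, example_freq_table, 1000);
}

static struct cpufreq_driver example_fast_driver = {
	.name		= "example-fast",
	/*
	 * CPUFREQ_DRIVER_FAST makes cpufreq_driver_is_slow() return
	 * false, so a scheduler-driven governor may call the driver
	 * directly instead of deferring to a worker thread.
	 */
	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_DRIVER_FAST,
	.init		= example_cpufreq_init,
	.verify		= cpufreq_generic_frequency_table_verify,
	.target_index	= example_set_rate_fast,
};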
Thanks to Juri Lelli for contributing design ideas, code and test results, and to Ricky Liang for initialization and static key inc/dec fixes. [0] http://article.gmane.org/gmane.linux.kernel/1499836 [smuckle@linaro.org: various additions and fixes, revised commit text] CC: Ricky Liang Signed-off-by: Michael Turquette Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- drivers/cpufreq/Kconfig | 21 ++ include/linux/cpufreq.h | 3 + include/linux/sched.h | 8 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched.c | 358 +++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 51 +++++ 7 files changed, 443 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/cpufreq_sched.c diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 659879a56dba..af523c539af0 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor. + +config CPU_FREQ_DEFAULT_GOV_SCHED + bool "sched" + select CPU_FREQ_GOV_SCHED + help + Use the CPUfreq governor 'sched' as default. This scales + cpu frequency using CPU utilization estimates from the + scheduler. + endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,18 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_SCHED + bool "'sched' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'sched' - this governor scales cpu frequency from the + scheduler as a function of cpu capacity utilization. It does + not evaluate utilization on a periodic basis (as ondemand + does) but instead is event-driven by the scheduler. + + If in doubt, say N. 
+ comment "CPU frequency scaling drivers" config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b5ef79aae048..b5433349938a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -496,6 +496,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED) +extern struct cpufreq_governor cpufreq_gov_sched; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched) #endif /********************************************************************* diff --git a/include/linux/sched.h b/include/linux/sched.h index 09e2d43406eb..1460c6a4f65f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -927,6 +927,14 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +struct sched_capacity_reqs { + unsigned long cfs; + unsigned long rt; + unsigned long dl; + + unsigned long total; +}; + /* * Wake-queues are lists of tasks with a pending wakeup, whose * callers have already marked the task as woken internally, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a541b5ce1dcc..0eabc9db4c3d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c new file mode 100644 index 000000000000..58bca8d2ca65 --- /dev/null +++ b/kernel/sched/cpufreq_sched.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) 2015 Michael Turquette + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +#define THROTTLE_NSEC 50000000 /* 50ms default */ + +struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; +static bool __read_mostly cpufreq_driver_slow; + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static struct cpufreq_governor cpufreq_gov_sched; +#endif + +static DEFINE_PER_CPU(unsigned long, enabled); +DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); + +/** + * gov_data - per-policy data internal to the governor + * @throttle: next throttling period expiry. Derived from throttle_nsec + * @throttle_nsec: throttle period length in nanoseconds + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * @requested_freq: last frequency requested by the sched governor + * + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A + * per-policy instance of it is created when the cpufreq_sched governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). 
+ */ +struct gov_data { + ktime_t throttle; + unsigned int throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + unsigned int requested_freq; +}; + +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, + unsigned int freq) +{ + struct gov_data *gd = policy->governor_data; + + /* avoid race with cpufreq_sched_stop */ + if (!down_write_trylock(&policy->rwsem)) + return; + + __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); + + gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + up_write(&policy->rwsem); +} + +static bool finish_last_request(struct gov_data *gd) +{ + ktime_t now = ktime_get(); + + if (ktime_after(now, gd->throttle)) + return false; + + while (1) { + int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + + usec_left /= NSEC_PER_USEC; + usleep_range(usec_left, usec_left + 100); + now = ktime_get(); + if (ktime_after(now, gd->throttle)) + return true; + } +} + +/* + * we pass in struct cpufreq_policy. This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int cpufreq_sched_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned int new_request = 0; + unsigned int last_request = 0; + int ret; + + policy = (struct cpufreq_policy *) data; + gd = policy->governor_data; + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + do { + set_current_state(TASK_INTERRUPTIBLE); + new_request = gd->requested_freq; + if (new_request == last_request) { + schedule(); + } else { + /* + * if the frequency thread sleeps while waiting to be + * unthrottled, start over to check for a newer request + */ + if (finish_last_request(gd)) + continue; + last_request = new_request; + cpufreq_sched_try_driver_target(policy, new_request); + } + } while (!kthread_should_stop()); + + return 0; +} + +static void cpufreq_sched_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) + return; + + wake_up_process(gd->task); +} + +static void update_fdomain_capacity_request(int cpu) +{ + unsigned int freq_new, index_new, cpu_tmp; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long capacity = 0; + + /* + * Avoid grabbing the policy if possible. A test is still + * required after locking the CPU's policy to avoid racing + * with the governor changing. 
+ */ + if (!per_cpu(enabled, cpu)) + return; + + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) + return; + + if (policy->governor != &cpufreq_gov_sched || + !policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* find max capacity requested by cpus in this policy */ + for_each_cpu(cpu_tmp, policy->cpus) { + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); + capacity = max(capacity, scr->total); + } + + /* Convert the new maximum capacity request into a cpu frequency */ + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + if (cpufreq_frequency_table_target(policy, policy->freq_table, + freq_new, CPUFREQ_RELATION_L, + &index_new)) + goto out; + freq_new = policy->freq_table[index_new].frequency; + + if (freq_new == gd->requested_freq) + goto out; + + gd->requested_freq = freq_new; + + /* + * Throttling is not yet supported on platforms with fast cpufreq + * drivers. + */ + if (cpufreq_driver_slow) + irq_work_queue_on(&gd->irq_work, cpu); + else + cpufreq_sched_try_driver_target(policy, freq_new); + +out: + cpufreq_cpu_put(policy); +} + +void update_cpu_capacity_request(int cpu, bool request) +{ + unsigned long new_capacity; + struct sched_capacity_reqs *scr; + + /* The rq lock serializes access to the CPU's sched_capacity_reqs. */ + lockdep_assert_held(&cpu_rq(cpu)->lock); + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + + new_capacity = scr->cfs + scr->rt; + new_capacity = new_capacity * capacity_margin + / SCHED_CAPACITY_SCALE; + new_capacity += scr->dl; + + if (new_capacity == scr->total) + return; + + scr->total = new_capacity; + if (request) + update_fdomain_capacity_request(cpu); +} + +static inline void set_sched_freq(void) +{ + static_key_slow_inc(&__sched_freq); +} + +static inline void clear_sched_freq(void) +{ + static_key_slow_dec(&__sched_freq); +} + +static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + int cpu; + + for_each_cpu(cpu, policy->cpus) + memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, + sizeof(struct sched_capacity_reqs)); + + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) + return -ENOMEM; + + gd->throttle_nsec = policy->cpuinfo.transition_latency ? 
+ policy->cpuinfo.transition_latency : + THROTTLE_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->throttle_nsec); + + if (cpufreq_driver_is_slow()) { + cpufreq_driver_slow = true; + gd->task = kthread_create(cpufreq_sched_thread, policy, + "kschedfreq:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR_OR_NULL(gd->task)) { + pr_err("%s: failed to create kschedfreq thread\n", + __func__); + goto err; + } + get_task_struct(gd->task); + kthread_bind_mask(gd->task, policy->related_cpus); + wake_up_process(gd->task); + init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); + } + + policy->governor_data = gd; + set_sched_freq(); + + return 0; + +err: + kfree(gd); + return -ENOMEM; +} + +static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) +{ + struct gov_data *gd = policy->governor_data; + + clear_sched_freq(); + if (cpufreq_driver_slow) { + kthread_stop(gd->task); + put_task_struct(gd->task); + } + + policy->governor_data = NULL; + + kfree(gd); + return 0; +} + +static int cpufreq_sched_start(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 1; + + return 0; +} + +static int cpufreq_sched_stop(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 0; + + return 0; +} + +static int cpufreq_sched_setup(struct cpufreq_policy *policy, + unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_POLICY_INIT: + return cpufreq_sched_policy_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return cpufreq_sched_policy_exit(policy); + case CPUFREQ_GOV_START: + return cpufreq_sched_start(policy); + case CPUFREQ_GOV_STOP: + return cpufreq_sched_stop(policy); + case CPUFREQ_GOV_LIMITS: + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static +#endif +struct cpufreq_governor cpufreq_gov_sched = { + .name = "sched", + .governor = cpufreq_sched_setup, + .owner = THIS_MODULE, +}; + +static int __init cpufreq_sched_init(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + per_cpu(enabled, cpu) = 0; + return cpufreq_register_governor(&cpufreq_gov_sched); +} + +/* Try to make this the default governor */ +fs_initcall(cpufreq_sched_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b8a9c68f3fb..3eb5beb590a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5064,7 +5064,7 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -static unsigned int capacity_margin = 1280; /* ~20% margin */ +unsigned int capacity_margin = 1280; /* ~20% margin */ static inline bool __task_fits(struct task_struct *p, int cpu, int util) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fcbb3e07accc..ac28024b2d7a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1455,6 +1455,57 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +extern unsigned int capacity_margin; +extern struct static_key __sched_freq; + +static inline bool sched_freq(void) +{ + return static_key_false(&__sched_freq); +} + +DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +void update_cpu_capacity_request(int cpu, bool request); + +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static 
inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; + update_cpu_capacity_request(cpu, request); + } +} +#else +static inline bool sched_freq(void) { return false; } +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); From ea429ccef14f4ef33e0a1c94a0369a435ae83807 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Aug 2015 19:47:12 +0100 Subject: [PATCH 0092/3220] sched/fair: add triggers for OPP change requests Each time a task is {en,de}queued we might need to adapt the current frequency to the new usage. Add triggers on {en,de}queue_task_fair() for this purpose. Only trigger a freq request if we are effectively waking up or going to sleep. Filter out load balancing related calls to reduce the number of triggers. [smuckle@linaro.org: resolve merge conflicts, define task_new, use renamed static key sched_freq] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3eb5beb590a8..844788da00be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,21 @@ static inline void hrtick_update(struct rq *rq) } #endif +static unsigned long capacity_orig_of(int cpu); +static int cpu_util(int cpu); + +static void update_capacity_of(int cpu) +{ + unsigned long req_cap; + + if (!sched_freq()) + return; + + /* Convert scale-invariant capacity to cpu. */ + req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + set_cfs_cpu_capacity(cpu, true, req_cap); +} + static bool cpu_overutilized(int cpu); /* @@ -4193,6 +4208,20 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are waking up; this is + * because we get here also during load balancing, but + * in these cases it seems wise to trigger as single + * request after load balancing is done. + * + * XXX: how about fork()? Do we need a special + * flag/something to tell if we are here after a + * fork() (wakeup_task_new)? 
+ */ + if (!task_new) + update_capacity_of(cpu_of(rq)); } hrtick_update(rq); } @@ -4251,9 +4280,24 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + /* + * We want to potentially trigger a freq switch + * request only for tasks that are going to sleep; + * this is because we get here also during load + * balancing, but in these cases it seems wise to + * trigger as single request after load balancing is + * done. + */ + if (task_sleep) { + if (rq->cfs.nr_running) + update_capacity_of(cpu_of(rq)); + else if (sched_freq()) + set_cfs_cpu_capacity(cpu_of(rq), false, 0); + } + } hrtick_update(rq); } From 7ff814dd71331fc41d513f422da81ed65e56be47 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 26 Jun 2015 12:14:23 +0100 Subject: [PATCH 0093/3220] sched/{core,fair}: trigger OPP change request on fork() Patch "sched/fair: add triggers for OPP change requests" introduced OPP change triggers for enqueue_task_fair(), but the trigger was operating only for wakeups. Fact is that it makes sense to consider wakeup_new also (i.e., fork()), as we don't know anything about a newly created task and thus we most certainly want to jump to max OPP to not harm performance too much. However, it is not currently possible (or at least it wasn't evident to me how to do so :/) to tell new wakeups from other (non wakeup) operations. This patch introduces an additional flag in sched.h that is only set at fork() time and it is then consumed in enqueue_task_fair() for our purpose. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 9 +++------ kernel/sched/sched.h | 1 + 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 23e82805ec47..5c8d0dbaae5d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2387,7 +2387,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 844788da00be..3937f79240df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4171,7 +4171,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int task_new = !(flags & ENQUEUE_WAKEUP); + int task_new = flags & ENQUEUE_WAKEUP_NEW; + int task_wakeup = flags & ENQUEUE_WAKEUP; for_each_sched_entity(se) { if (se->on_rq) @@ -4215,12 +4216,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * because we get here also during load balancing, but * in these cases it seems wise to trigger as single * request after load balancing is done. - * - * XXX: how about fork()? Do we need a special - * flag/something to tell if we are here after a - * fork() (wakeup_task_new)? 
*/ - if (!task_new) + if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } hrtick_update(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ac28024b2d7a..0052ada93ae8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1181,6 +1181,7 @@ static const u32 prio_to_wmult[40] = { #endif #define ENQUEUE_REPLENISH 0x08 #define ENQUEUE_RESTORE 0x10 +#define ENQUEUE_WAKEUP_NEW 0x20 #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 From f99e3fe6eb9cba8f645f8d00819d8007ee787e2c Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 25 Jun 2015 14:37:27 +0100 Subject: [PATCH 0094/3220] sched/fair: cpufreq_sched triggers for load balancing As we don't trigger freq changes from {en,de}queue_task_fair() during load balancing, we need to do explicitly so on load balancing paths. [smuckle@linaro.org: move update_capacity_of calls so rq lock is held] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3937f79240df..950046b1b8eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6384,6 +6384,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); + /* + * We want to potentially raise target_cpu's OPP. + */ + update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -6405,6 +6409,11 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } + /* + * We want to potentially raise env.dst_cpu's OPP. + */ + update_capacity_of(env->dst_cpu); + raw_spin_unlock(&env->dst_rq->lock); } @@ -7667,6 +7676,11 @@ more_balance: * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + if (cur_ld_moved) + update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. Every @@ -8037,8 +8051,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + update_capacity_of(env.src_cpu); + } else schedstat_inc(sd, alb_failed); } From 6b6c1924539526c89384067fe70b9fe856c7b626 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 25 Jun 2015 14:12:33 +0100 Subject: [PATCH 0095/3220] sched/fair: jump to max OPP when crossing UP threshold Since the true utilization of a long running task is not detectable while it is running and might be bigger than the current cpu capacity, create the maximum cpu capacity head room by requesting the maximum cpu capacity once the cpu usage plus the capacity margin exceeds the current capacity. This is also done to try to harm the performance of a task the least. Original fair-class only version authored by Juri Lelli . 
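
For illustration (numbers are made up; capacity_margin defaults to 1280,
i.e. a ~20% margin): a CPU with cpu_util() = 400 and no RT/DL capacity
requests yields a margin-scaled request of 400 * 1280 / 1024 = 500, so if
the current capacity is, say, 480, the tick handler requests the maximum
capacity and the task can keep building up its utilization unthrottled.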
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++ kernel/sched/fair.c | 66 ------------------------------------------ kernel/sched/sched.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 66 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c8d0dbaae5d..ac3d9af477f4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2854,6 +2854,45 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +static unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = cfs_cap + scr->rt; + + total = total * capacity_margin; + total /= SCHED_CAPACITY_SCALE; + total += scr->dl; + return total; +} + +static void sched_freq_tick(int cpu) +{ + struct sched_capacity_reqs *scr; + unsigned long capacity_orig, capacity_curr; + + if (!sched_freq()) + return; + + capacity_orig = capacity_orig_of(cpu); + capacity_curr = capacity_curr_of(cpu); + if (capacity_curr == capacity_orig) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to max OPP as soon as the margin of free capacity is + * impacted (specified by capacity_margin). + */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) + set_cfs_cpu_capacity(cpu, true, capacity_max); +} +#else +static inline void sched_freq_tick(int cpu) { } +#endif + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -2880,6 +2919,8 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); + + sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 950046b1b8eb..bb1d0e3d7835 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,9 +4144,6 @@ static inline void hrtick_update(struct rq *rq) } #endif -static unsigned long capacity_orig_of(int cpu); -static int cpu_util(int cpu); - static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4521,15 +4518,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -4698,60 +4686,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. 
It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static unsigned long __cpu_util(int cpu, int delta) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - delta += util; - if (delta < 0) - return 0; - - return (delta >= capacity) ? capacity : delta; -} - -static unsigned long cpu_util(int cpu) -{ - return __cpu_util(cpu, 0); -} - static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0052ada93ae8..6aaff682adb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1456,7 +1456,75 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. 
We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static inline unsigned long __cpu_util(int cpu, int delta) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static inline unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); +} + +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static inline unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHED +#define capacity_max SCHED_CAPACITY_SCALE extern unsigned int capacity_margin; extern struct static_key __sched_freq; From 9d44dc7fe29c4b925ddbbd6a2551929da28dfbfb Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 25 Nov 2015 15:59:25 -0800 Subject: [PATCH 0096/3220] sched/cpufreq_sched: add trace events Trace events will aid in debugging, profiling and tuning. Signed-off-by: Steve Muckle --- include/trace/events/cpufreq_sched.h | 87 ++++++++++++++++++++++++++++ kernel/sched/cpufreq_sched.c | 9 +++ 2 files changed, 96 insertions(+) create mode 100644 include/trace/events/cpufreq_sched.h diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h new file mode 100644 index 000000000000..a46cd088e969 --- /dev/null +++ b/include/trace/events/cpufreq_sched.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2015 Steve Muckle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpufreq_sched + +#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUFREQ_SCHED_H + +#include +#include + +TRACE_EVENT(cpufreq_sched_throttled, + TP_PROTO(unsigned int rem), + TP_ARGS(rem), + TP_STRUCT__entry( + __field( unsigned int, rem) + ), + TP_fast_assign( + __entry->rem = rem; + ), + TP_printk("throttled - %d usec remaining", __entry->rem) +); + +TRACE_EVENT(cpufreq_sched_request_opp, + TP_PROTO(int cpu, + unsigned long capacity, + unsigned int freq_new, + unsigned int requested_freq), + TP_ARGS(cpu, capacity, freq_new, requested_freq), + TP_STRUCT__entry( + __field( int, cpu) + __field( unsigned long, capacity) + __field( unsigned int, freq_new) + __field( unsigned int, requested_freq) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->capacity = capacity; + __entry->freq_new = freq_new; + __entry->requested_freq = requested_freq; + ), + TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d " + "(cur %d)", + __entry->cpu, __entry->capacity, __entry->freq_new, + __entry->requested_freq) +); + +TRACE_EVENT(cpufreq_sched_update_capacity, + TP_PROTO(int cpu, + bool request, + struct sched_capacity_reqs *scr, + unsigned long new_capacity), + TP_ARGS(cpu, request, scr, new_capacity), + TP_STRUCT__entry( + __field( int, cpu) + __field( bool, request) + __field( unsigned long, cfs) + __field( unsigned long, rt) + __field( unsigned long, dl) + __field( unsigned long, total) + __field( unsigned long, new_total) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->request = request; + __entry->cfs = scr->cfs; + __entry->rt = scr->rt; + __entry->dl = scr->dl; + __entry->total = scr->total; + __entry->new_total = new_capacity; + ), + TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld " + "new_tot=%ld", + __entry->cpu, __entry->request, __entry->cfs, __entry->rt, + __entry->dl, __entry->total, __entry->new_total) +); + +#endif /* _TRACE_CPUFREQ_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 58bca8d2ca65..5afe56a82491 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -14,6 +14,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "sched.h" #define THROTTLE_NSEC 50000000 /* 50ms default */ @@ -78,6 +81,7 @@ static bool finish_last_request(struct gov_data *gd) int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); usec_left /= NSEC_PER_USEC; + trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); if (ktime_after(now, gd->throttle)) @@ -186,6 +190,9 @@ static void update_fdomain_capacity_request(int cpu) goto out; freq_new = policy->freq_table[index_new].frequency; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, + gd->requested_freq); + if (freq_new == gd->requested_freq) goto out; @@ -222,6 +229,8 @@ void update_cpu_capacity_request(int cpu, bool request) if (new_capacity == scr->total) return; + trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); + scr->total = new_capacity; if (request) update_fdomain_capacity_request(cpu); From cd248fa758ba2358d4dbf612090e6bbc075c46b1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 20 Oct 2015 10:46:26 +0200 Subject: [PATCH 0097/3220] sched: remove call of sched_avg_update from sched_rt_avg_update rt_avg is only used to scale the available CPU's capacity for CFS tasks. 
As the update of this scaling is done during periodic load balance, we only have to ensure that sched_avg_update has been called before any periodic load balancing. This requirement is already fulfilled by __update_cpu_load so the call in sched_rt_avg_update, which is part of the hotpath, is useless. Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6aaff682adb8..770e119ca692 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1578,7 +1578,6 @@ static inline void set_dl_cpu_capacity(int cpu, bool request, static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); - sched_avg_update(rq); } #else static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } From fab5cc59bf31aac4bcb222c7d1013a033cb8ef70 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Nov 2015 10:39:01 +0100 Subject: [PATCH 0098/3220] sched: deadline: use deadline bandwidth in scale_rt_capacity Instead of monitoring the exec time of deadline tasks to evaluate the CPU capacity consumed by deadline scheduler class, we can directly calculate it thanks to the sum of utilization of deadline tasks on the CPU. We can remove deadline tasks from rt_avg metric and directly use the average bandwidth of deadline scheduler in scale_rt_capacity. Based in part on a similar patch from Luca Abeni . Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 8 ++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e285f9..9d9eb50d4059 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw += se_bw; +} + +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw -= se_bw; + if (dl_rq->avg_bw < 0) { + WARN_ON(1); + dl_rq->avg_bw = 0; + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -494,6 +512,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + if (dl_se->dl_new) + add_average_bw(dl_se, dl_rq); + /* * The arrival of a new instance needs special treatment, i.e., * the actual scheduling parameters have to be "renewed". @@ -741,8 +762,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(dl_se)) { dl_se->dl_throttled = 1; @@ -1241,6 +1260,8 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_rq *dl_rq = dl_rq_of_se(&p->dl); + struct rq *rq = rq_of_dl_rq(dl_rq); /* * Since we are TASK_DEAD we won't slip out of the domain! 
@@ -1249,6 +1270,8 @@ static void task_dead_dl(struct task_struct *p)
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
+
+	clear_average_bw(&p->dl, &rq->dl);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1556,7 +1579,9 @@ retry:
 	}
 
 	deactivate_task(rq, next_task, 0);
+	clear_average_bw(&next_task->dl, &rq->dl);
 	set_task_cpu(next_task, later_rq->cpu);
+	add_average_bw(&next_task->dl, &later_rq->dl);
 	activate_task(later_rq, next_task, 0);
 	ret = 1;
 
@@ -1644,7 +1669,9 @@ static void pull_dl_task(struct rq *this_rq)
 			resched = true;
 
 			deactivate_task(src_rq, p, 0);
+			clear_average_bw(&p->dl, &src_rq->dl);
 			set_task_cpu(p, this_cpu);
+			add_average_bw(&p->dl, &this_rq->dl);
 			activate_task(this_rq, p, 0);
 			dmin = p->dl.deadline;
 
@@ -1750,6 +1777,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!start_dl_timer(p))
 		__dl_clear_params(p);
 
+	clear_average_bw(&p->dl, &rq->dl);
+
 	/*
 	 * Since this might be the only -deadline task on the rq,
 	 * this is the right place to try to pull some other one
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bb1d0e3d7835..651e160cdd67 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6550,6 +6550,14 @@ static unsigned long scale_rt_capacity(int cpu)
 
 	used = div_u64(avg, total);
 
+	/*
+	 * deadline bandwidth is defined at system level so we must
+	 * weight this bandwidth with the max capacity of the system.
+	 * As a reminder, avg_bw is 20bits width and
+	 * scale_cpu_capacity is 10 bits width
+	 */
+	used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
 	if (likely(used < SCHED_CAPACITY_SCALE))
 		return SCHED_CAPACITY_SCALE - used;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 770e119ca692..983c132af11b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -506,6 +506,8 @@ struct dl_rq {
 #else
 	struct dl_bw dl_bw;
 #endif
+	/* This is the "average utilization" for this runqueue */
+	s64 avg_bw;
 };
 
 #ifdef CONFIG_SMP

From 6e1e1ed8720652305b77b6d01f80276ef076aff0 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 26 Oct 2015 18:14:50 +0100
Subject: [PATCH 0099/3220] sched: rt scheduler sets capacity requirement

RT tasks don't provide any running constraints like deadline ones
except their running priority. The only currently usable input to
estimate the capacity needed by RT tasks is the rt_avg metric. We use
it to estimate the CPU capacity needed for the RT scheduler class.

In order to monitor the evolution of RT task load, we must periodically
check it during the tick. Then, we use the estimated capacity of the
last activity to estimate the next one, which cannot be that accurate
but is a good starting point without any impact on the wake up path of
RT tasks.

Signed-off-by: Vincent Guittot
Signed-off-by: Steve Muckle
---
 kernel/sched/rt.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0ea1..9694204660b7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1426,6 +1426,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
 #endif
 }
 
+#ifdef CONFIG_SMP
+static void sched_rt_update_capacity_req(struct rq *rq)
+{
+	u64 total, used, age_stamp, avg;
+	s64 delta;
+
+	if (!sched_freq())
+		return;
+
+	sched_avg_update(rq);
+	/*
+	 * Since we're reading these variables without serialization make sure
+	 * we read them once before doing sanity checks on them.
+	 */
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
+	delta = rq_clock(rq) - age_stamp;
+
+	if (unlikely(delta < 0))
+		delta = 0;
+
+	total = sched_avg_period() + delta;
+
+	used = div_u64(avg, total);
+	if (unlikely(used > SCHED_CAPACITY_SCALE))
+		used = SCHED_CAPACITY_SCALE;
+
+	set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
+}
+#else
+static inline void sched_rt_update_capacity_req(struct rq *rq)
+{ }
+
+#endif
+
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 						   struct rt_rq *rt_rq)
 {
@@ -1494,8 +1529,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	if (prev->sched_class == &rt_sched_class)
 		update_curr_rt(rq);
 
-	if (!rt_rq->rt_queued)
+	if (!rt_rq->rt_queued) {
+		/*
+		 * The next task to be picked on this rq will have a lower
+		 * priority than rt tasks so we can spend some time to update
+		 * the capacity used by rt tasks based on the last activity.
+		 * This value will then be used as an estimate of the next
+		 * activity.
+		 */
+		sched_rt_update_capacity_req(rq);
 		return NULL;
+	}
 
 	put_prev_task(rq, prev);
 
@@ -2212,6 +2256,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 
 	update_curr_rt(rq);
 
+	if (rq->rt.rt_nr_running)
+		sched_rt_update_capacity_req(rq);
+
 	watchdog(rq, p);
 
 	/*

From 3eb29105cfe14719e0b61c782efd4b3b509d786d Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Fri, 11 Dec 2015 11:55:51 +0000
Subject: [PATCH 0100/3220] fixup! sched: scheduler-driven cpu frequency selection

Signed-off-by: Juri Lelli
---
 kernel/sched/cpufreq_sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index 5afe56a82491..e1d208e101ed 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -119,9 +119,9 @@ static int cpufreq_sched_thread(void *data)
 	}
 
 	do {
-		set_current_state(TASK_INTERRUPTIBLE);
 		new_request = gd->requested_freq;
 		if (new_request == last_request) {
+			set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
 		} else {
 			/*

From 08d1cfd7c9193272e6caa100cbc5ef875cb5543c Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Fri, 11 Dec 2015 11:58:05 +0000
Subject: [PATCH 0101/3220] fixup! sched/fair: jump to max OPP when crossing UP threshold

Signed-off-by: Juri Lelli
---
 kernel/sched/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ac3d9af477f4..151ed9c62ad2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2910,6 +2910,7 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_cpu_load_active(rq);
 	calc_global_load_tick(rq);
+	sched_freq_tick(cpu);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
@@ -2919,8 +2920,6 @@ void scheduler_tick(void)
 	trigger_load_balance(rq);
 #endif
 	rq_last_tick_reset(rq);
-
-	sched_freq_tick(cpu);
 }
 
 #ifdef CONFIG_NO_HZ_FULL

From 8aaf022dd7e5390b3e95fe92a8f49b1de949a687 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Tue, 30 Jun 2015 12:03:26 +0100
Subject: [PATCH 0102/3220] sched/tune: add detailed documentation

The topic of a single simple power-performance tunable that is wholly
scheduler centric and has well defined and predictable properties has
come up on several occasions in the past. With techniques such as
scheduler-driven DVFS, we now have a good framework for implementing
such a tunable.

This patch provides a detailed description of the motivations and
design decisions behind the implementation of SchedTune.
cc: Jonathan Corbet
cc: linux-doc@vger.kernel.org
Signed-off-by: Patrick Bellasi
---
 Documentation/scheduler/sched-tune.txt | 366 +++++++++++++++++++++++++
 1 file changed, 366 insertions(+)
 create mode 100644 Documentation/scheduler/sched-tune.txt

diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt
new file mode 100644
index 000000000000..9bd2231c01b1
--- /dev/null
+++ b/Documentation/scheduler/sched-tune.txt
@@ -0,0 +1,366 @@
+             Central, scheduler-driven, power-performance control
+                               (EXPERIMENTAL)
+
+Abstract
+========
+
+The topic of a single simple power-performance tunable that is wholly
+scheduler centric and has well defined and predictable properties has come up
+on several occasions in the past [1,2]. With techniques such as scheduler
+driven DVFS [3], we now have a good framework for implementing such a tunable.
+This document describes the overall ideas behind its design and
+implementation.
+
+
+Table of Contents
+=================
+
+1. Motivation
+2. Introduction
+3. Signal Boosting Strategy
+4. OPP selection using boosted CPU utilization
+5. Per task group boosting
+6. Questions and Answers
+   - What about "auto" mode?
+   - What about boosting on a congested system?
+   - How are CPUs boosted when we have tasks with multiple boost values?
+7. References
+
+
+1. Motivation
+=============
+
+Sched-DVFS [3] is a new event-driven cpufreq governor which allows the
+scheduler to select the optimal DVFS operating point (OPP) for running a task
+allocated to a CPU. The introduction of sched-DVFS enables running workloads
+at the most energy efficient OPPs.
+
+However, sometimes it may be desired to intentionally boost the performance of
+a workload even if that could imply a reasonable increase in energy
+consumption. For example, in order to reduce the response time of a task, we
+may want to run the task at a higher OPP than the one that is actually
+required by its CPU bandwidth demand.
+
+This last requirement is especially important if we consider that one of the
+main goals of the sched-DVFS component is to replace all currently available
+CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling
+driven governors we currently have, it is already more responsive at selecting
+the optimal OPP to run tasks allocated to a CPU. However, just tracking the
+actual task load demand may not be enough from a performance standpoint. For
+example, it is not possible to get behaviors similar to those provided by the
+"performance" and "interactive" CPUFreq governors.
+
+This document describes an implementation of a tunable, stacked on top of
+sched-DVFS, which extends its functionality to support task performance
+boosting.
+
+By "performance boosting" we mean the reduction of the time required to
+complete a task activation, i.e. the time elapsed from a task wakeup to its
+next deactivation (e.g. because it goes back to sleep or it terminates). For
+example, if we consider a simple periodic task which executes the same
+workload for 5[s] every 20[s] while running at a certain OPP, a boosted
+execution of that task must complete each of its activations in less than
+5[s].
+
+A previous attempt [5] to introduce such a boosting feature has not been
+successful mainly because of the complexity of the proposed solution. The
+approach described in this document exposes a single simple interface to
+user-space.
+This single tunable knob allows the tuning of system-wide scheduler
+behaviours ranging from energy efficiency at one end through to incremental
+performance boosting at the other end. This first tunable affects all tasks.
+However, a more advanced extension of the concept is also provided which uses
+CGroups to boost the performance of only selected tasks while using the
+energy efficient default for all others.
+
+The rest of this document introduces in more detail the proposed solution,
+which has been named SchedTune.
+
+
+2. Introduction
+===============
+
+SchedTune exposes a simple user-space interface with a single power-performance
+tunable:
+
+  /proc/sys/kernel/sched_cfs_boost
+
+This permits expressing a boost value as an integer in the range [0..100].
+
+A value of 0 (default) configures the CFS scheduler for maximum energy
+efficiency. This means that sched-DVFS runs the tasks at the minimum OPP
+required to satisfy their workload demand.
+A value of 100 configures the scheduler for maximum performance, which
+translates to the selection of the maximum OPP on that CPU.
+
+Values between 0 and 100 can be used to suit other scenarios, for example to
+satisfy interactive response requirements or to react to other system events
+(battery level, etc).
+
+A CGroup based extension is also provided, which permits further user-space
+defined task classification to tune the scheduler for different goals
+depending on the specific nature of the task, e.g. background vs interactive
+vs low-priority.
+
+The overall design of the SchedTune module is built on top of "Per-Entity
+Load Tracking" (PELT) signals and sched-DVFS by introducing a bias on the
+Operating Performance Point (OPP) selection.
+Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune
+the operating frequency of that CPU to better match the workload demand. The
+selection of the actual OPP being activated is influenced by the global boost
+value, or the boost value for the task CGroup when in use.
+
+This simple biasing approach leverages existing frameworks, which means
+minimal modifications to the scheduler, and yet it makes it possible to
+achieve a range of different behaviours, all from a single simple tunable
+knob. The only new concept introduced is that of signal boosting.
+
+
+3. Signal Boosting Strategy
+===========================
+
+The whole PELT machinery works based on the value of a few load tracking
+signals which basically track the CPU bandwidth requirements for tasks and
+the capacity of CPUs. The basic idea behind the SchedTune knob is to
+artificially inflate some of these load tracking signals to make a task or RQ
+appear more demanding than it actually is.
+
+Which signals have to be inflated depends on the specific "consumer". However,
+independently of the specific (signal, consumer) pair, it is important to
+define a simple and possibly consistent strategy for the concept of boosting
+a signal.
+
+A boosting strategy defines how the "abstract" user-space defined
+sched_cfs_boost value is translated into an internal "margin" value to be
+added to a signal to get its inflated value:
+
+  margin         := boosting_strategy(sched_cfs_boost, signal)
+  boosted_signal := signal + margin
+
+Different boosting strategies were identified and analyzed before selecting
+the one found to be most effective.
+
+Signal Proportional Compensation (SPC)
+--------------------------------------
+
+In this boosting strategy the sched_cfs_boost value is used to compute a
+margin which is proportional to the complement of the original signal.
+When a signal has a maximum possible value, its complement is defined as
+the delta between the actual value and its possible maximum.
+
+Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as
+the maximum possible value, the margin becomes:
+
+	margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal)
+
+Using this boosting strategy:
+- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
+- each value in the range of sched_cfs_boost effectively inflates the signal
+  in question by a quantity which is proportional to the maximum value.
+
+For example, by applying the SPC boosting strategy to the selection of the OPP
+to run a task it is possible to achieve these behaviors:
+
+- 0% boosting:   run the task at the minimum OPP required by its workload
+- 100% boosting: run the task at the maximum OPP available for the CPU
+- 50% boosting:  run at the half-way OPP between minimum and maximum
+
+Which means that, at 50% boosting, a task will be scheduled to run at half of
+the maximum theoretically achievable performance on the specific target
+platform.
+
+A graphical representation of an SPC boosted signal is represented in the
+following figure where:
+ a) "-" represents the original signal
+ b) "b" represents a  50% boosted signal
+ c) "p" represents a 100% boosted signal
+
+
+   ^
+   |  SCHED_LOAD_SCALE
+   +-----------------------------------------------------------------+
+   |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
+   |
+   |                                             boosted_signal
+   |                                          bbbbbbbbbbbbbbbbbbbbbbbb
+   |
+   |                                            original signal
+   |                  bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
+   |                                          |
+   |bbbbbbbbbbbbbbbbbb                        |
+   |                                          |
+   |                                          |
+   |                                          |
+   |                  +-----------------------+
+   |                  |
+   |                  |
+   |                  |
+   |------------------+
+   |
+   |
+   +----------------------------------------------------------------------->
+
+The plot above shows a ramped load signal (labelled 'original signal') and
+its boosted equivalent. For each step of the original signal the boosted
+signal corresponding to a 50% boost is midway from the original signal and
+the upper bound. Boosting by 100% generates a boosted signal which is always
+saturated to the upper bound.
+
+
+4. OPP selection using boosted CPU utilization
+==============================================
+
+It is worth calling out that the implementation does not introduce any new
+load signals. Instead, it provides an API to tune existing signals. This
+tuning is done on demand and only in scheduler code paths where it is
+sensible to do so. The new API calls are defined to return either the default
+signal or a boosted one, depending on the value of sched_cfs_boost. This is a
+clean and non-invasive modification of the existing code paths.
+
+The signal representing a CPU's utilization is boosted according to the
+previously described SPC boosting strategy. To sched-DVFS, this allows a CPU
+(i.e. a CFS run-queue) to appear more utilized than it actually is.
+
+Thus, with sched_cfs_boost enabled we have the following main functions to
+get the current utilization of a CPU:
+
+  cpu_util()
+  boosted_cpu_util()
+
+The new boosted_cpu_util() is similar to cpu_util(), but it returns a boosted
+utilization signal which is a function of the sched_cfs_boost value.
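+
+As a minimal, stand-alone illustration of this transformation (plain C, not
+the actual kernel implementation; SCHED_LOAD_SCALE is assumed to be 1024):
+
+  #define SCHED_LOAD_SCALE 1024
+
+  /* SPC: margin proportional to the complement of the signal */
+  unsigned long boosted_util(unsigned long util, unsigned int boost_pct)
+  {
+          unsigned long margin = (SCHED_LOAD_SCALE - util) * boost_pct / 100;
+
+          return util + margin;
+  }
+
+For example, boosted_util(512, 50) returns 768, i.e. midway between the
+original utilization and the upper bound, while boosted_util(512, 100)
+saturates to SCHED_LOAD_SCALE. boosted_cpu_util() applies this kind of
+transformation to cpu_util().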
+
+boosted_cpu_util() is used in the CFS scheduler code paths where sched-DVFS
+needs to decide the OPP to run a CPU at.
+For example, this allows selecting the highest OPP for a CPU which has
+the boost value set to 100%.
+
+
+5. Per task group boosting
+==========================
+
+The availability of a single knob which is used to boost all tasks in the
+system is certainly a simple solution, but it quite likely doesn't fit many
+utilization scenarios, especially in the mobile device space.
+
+For example, on battery powered devices there usually are many background
+services which are long running and need energy efficient scheduling. On the
+other hand, some applications are more performance sensitive and require an
+interactive response and/or maximum performance, regardless of the energy
+cost. To better service such scenarios, the SchedTune implementation has an
+extension that provides a more fine grained boosting interface.
+
+A new CGroup controller, namely "schedtune", can be enabled; it allows
+defining and configuring task groups with different boost values.
+Tasks that require special performance can be put into separate CGroups.
+The value of the boost associated with the tasks in this group can be
+specified using a single knob exposed by the CGroup controller:
+
+  schedtune.boost
+
+This knob allows the definition of a boost value that is to be used for
+SPC boosting of all tasks attached to this group.
+
+The current schedtune controller implementation is really simple and has
+these main characteristics:
+
+  1) It is only possible to create hierarchies one level deep
+
+     The root control group defines the system-wide boost value to be applied
+     by default to all tasks. Its direct subgroups are named "boost groups"
+     and they define the boost value for specific sets of tasks.
+     Further nested subgroups are not allowed since they do not have a
+     sensible meaning from a user-space standpoint.
+
+  2) It is possible to define only a limited number of "boost groups"
+
+     This number is defined at compile time and by default configured to 16.
+     This is a design decision motivated by two main reasons:
+     a) In a real system we do not expect utilization scenarios with more
+        than a few boost groups. For example, a reasonable collection of
+        groups could be just "background", "interactive" and "performance".
+     b) It simplifies the implementation considerably, especially for the
+        code which has to compute the per CPU boosting once there are
+        multiple RUNNABLE tasks with different boost values.
+
+Such a simple design should allow servicing the main utilization scenarios
+identified so far. It provides a simple interface which can be used to manage
+the power-performance of all tasks or only selected tasks.
+Moreover, this interface can be easily integrated by user-space run-times
+(e.g. Android, ChromeOS) to implement a QoS solution for task boosting based
+on task classification, which has been a long standing requirement.
+
+Setup and usage
+---------------
+
+0. Use a kernel with CGROUP_SCHEDTUNE support enabled
+
+1. Check that the "schedtune" CGroup controller is available:
+
+   root@linaro-nano:~# cat /proc/cgroups
+   #subsys_name	hierarchy	num_cgroups	enabled
+   cpuset   	0		1		1
+   cpu      	0		1		1
+   schedtune	0		1		1
+
+2. Mount a tmpfs to create the CGroups mount point (Optional)
+
+   root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup
+
+3. Mount the "schedtune" controller
+
+   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
+   root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
+
+4. Setup the system-wide boost value (Optional)
+
+   If not configured, the root control group has a 0% boost value, which
+   basically disables boosting for all tasks in the system, thus running in
+   an energy-efficient mode.
+
+   root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost
+
+5. Create task groups and configure their specific boost value (Optional)
+
+   For example, here we create a "performance" boost group configured to
+   boost all its tasks to 100%
+
+   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
+   root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost
+
+6. Move tasks into the boost group
+
+   For example, the following moves the tasks with PID $TASKPID (and all its
+   threads) into the "performance" boost group.
+
+   root@linaro-nano:~# echo $TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs
+
+This simple configuration allows only the threads of the $TASKPID task to run,
+when needed, at the highest OPP on the most capable CPU of the system.
+
+
+6. Questions and Answers
+========================
+
+What about "auto" mode?
+-----------------------
+
+The 'auto' mode as described in [5] can be implemented by interfacing
+SchedTune with some suitable user-space element. This element could use the
+exposed system-wide or cgroup based interface.
+
+How are multiple groups of tasks with different boost values managed?
+---------------------------------------------------------------------
+
+The current SchedTune implementation keeps track of the boosted RUNNABLE
+tasks on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU
+utilization is boosted with a value which is the maximum of the boost values
+of the currently RUNNABLE tasks in its RQ.
+
+This allows sched-DVFS to boost a CPU only while there are boosted tasks
+ready to run and to switch back to the energy efficient mode as soon as the
+last boosted task is dequeued.
+
+
+7. References
+=============
+[1] http://lwn.net/Articles/552889
+[2] http://lkml.org/lkml/2012/5/18/91
+[3] http://lkml.org/lkml/2015/6/26/620

From 344f4ec4293b75d83282eb13ede965995e5f312f Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Mon, 22 Jun 2015 18:11:44 +0100
Subject: [PATCH 0103/3220] sched/tune: add sysctl interface to define a boost value

The current (CFS) scheduler implementation does not allow "boosting"
task performance by running tasks at a higher OPP compared to the
minimum required to meet their workload demands.

To support task performance boosting, the scheduler should provide a
"knob" which allows tuning how much the system is going to be optimised
for energy efficiency vs performance.

This patch is the first of a series which provides a simple interface to
define a tuning knob. One system-wide "boost" tunable is exposed via:

  /proc/sys/kernel/sched_cfs_boost

which can be configured in the range [0..100], to define a percentage
where:

  - 0% boost requires operating in "standard" mode by scheduling
    tasks at the minimum capacities required by the workload demand
  - 100% boost requires pushing task performance to the maximum,
    "regardless" of the incurred energy consumption

A boost value in between these two boundaries is used to bias the
power/performance trade-off; the higher the boost value, the more the
scheduler is biased toward performance boosting instead of energy
efficiency.
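
For illustration only (the value is arbitrary), a system-wide 25% boost
could then be requested with:

  echo 25 > /proc/sys/kernel/sched_cfs_boost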
cc: Ingo Molnar
cc: Peter Zijlstra

Signed-off-by: Patrick Bellasi
---
 include/linux/sched/sysctl.h | 16 ++++++++++++++++
 init/Kconfig                 | 26 ++++++++++++++++++++++++++
 kernel/sched/Makefile        |  1 +
 kernel/sched/tune.c          | 16 ++++++++++++++++
 kernel/sysctl.c              | 11 +++++++++++
 5 files changed, 70 insertions(+)
 create mode 100644 kernel/sched/tune.c

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..4479e48c7712 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -77,6 +77,22 @@ extern int sysctl_sched_rt_runtime;
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
 
+#ifdef CONFIG_SCHED_TUNE
+extern unsigned int sysctl_sched_cfs_boost;
+int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+				   void __user *buffer, size_t *length,
+				   loff_t *ppos);
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+	return sysctl_sched_cfs_boost;
+}
+#else
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 235c7a2c0d20..83d8c02e7a57 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1237,6 +1237,32 @@ config SCHED_AUTOGROUP
 	  desktop applications.  Task group autogeneration is currently based
 	  upon task session.
 
+config SCHED_TUNE
+	bool "Boosting for CFS tasks (EXPERIMENTAL)"
+	help
+	  This option enables the system-wide support for task boosting.
+	  When this support is enabled a new sysctl interface is exposed to
+	  userspace via:
+	     /proc/sys/kernel/sched_cfs_boost
+	  which allows setting a system-wide boost value in the range
+	  [0..100].
+
+	  The current boosting strategy is implemented in such a way that:
+	  - a 0% boost value requires operating in "standard" mode by
+	    scheduling all tasks at the minimum capacities required by their
+	    workload demand
+	  - a 100% boost value requires pushing task performance to the
+	    maximum, "regardless" of the incurred energy consumption
+
+	  A boost value in between these two boundaries is used to bias the
+	  power/performance trade-off; the higher the boost value, the more
+	  the scheduler is biased toward performance boosting instead of
+	  energy efficiency.
+
+	  Since this support exposes a single system-wide knob, the specified
+	  boost value is applied to all (CFS) tasks in the system.
+
+	  If unsure, say N.
+
 config SYSFS_DEPRECATED
 	bool "Enable deprecated sysfs features to support old userspace tools"
 	depends on SYSFS
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 0eabc9db4c3d..c6a85f813dfd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,5 +18,6 @@ obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 000000000000..a93af9c2f267
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,16 @@
+#include "sched.h"
+
+unsigned int sysctl_sched_cfs_boost __read_mostly;
+
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp,
+			       loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	return 0;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dc6858d6639e..827ba256fdad 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -434,6 +434,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#ifdef CONFIG_SCHED_TUNE
+	{
+		.procname	= "sched_cfs_boost",
+		.data		= &sysctl_sched_cfs_boost,
+		.maxlen		= sizeof(sysctl_sched_cfs_boost),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_sched_cfs_boost_handler,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",

From 6ed27145013d97e49d7e71a56601033cccee743c Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Mon, 22 Jun 2015 18:32:36 +0100
Subject: [PATCH 0104/3220] sched/fair: add function to convert boost value into "margin"

The basic idea of the boost knob is to "artificially inflate" a signal
to make a task or logical CPU appear more demanding than it actually is.

Independently of the specific signal, a consistent and possibly simple
semantic for the concept of "signal boosting" must define:
1. how we translate the boost percentage into a "margin" value to be
   added to the original signal to inflate
2. what is the meaning of a boost value from a user-space perspective

This patch provides the implementation of a possible boost semantic,
named "Signal Proportional Compensation" (SPC), where the boost
percentage (BP) is used to compute a margin (M) which is proportional
to the complement of the original signal (OS):

  M = BP * (SCHED_LOAD_SCALE - OS)

The computed margin is then added to the OS to obtain the Boosted
Signal (BS):

  BS = OS + M

The proposed boost semantic has these main features:
- each signal gets a boost which is proportional to its delta with
  respect to the maximum available capacity in the system
  (i.e. SCHED_LOAD_SCALE)
- a 100% boosting has a clear meaning from a user-space perspective,
  since it means simply to run (possibly) "all" tasks at the max OPP
- each boosting value means to improve the task performance by a
  quantity which is proportional to the maximum achievable performance
  on that system

Thus this semantic somehow forces a behaviour where 50% boosting means
running half-way between the current and the maximum performance which
a task could achieve on that system.

This patch provides the code to implement a fast integer division to
convert a boost percentage (BP) value into a margin (M).
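
As a quick sanity check of the constants below (with illustrative numbers,
assuming SCHED_LOAD_SCALE = 1024): for BP = 50 and OS = 512 the pre-division
margin is 50 * (1024 - 512) = 25600, and the fast path computes
(25600 * 1311) >> 17 = 256, matching the exact 25600 / 100 = 256.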
NOTE: this code is suitable for all signals operating in range [0..SCHED_LOAD_SCALE] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 651e160cdd67..1b6a57837433 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5074,6 +5074,44 @@ static bool cpu_overutilized(int cpu) return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); } +#ifdef CONFIG_SCHED_TUNE + +static unsigned long +schedtune_margin(unsigned long signal, unsigned long boost) +{ + unsigned long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_LOAD_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + + /* + * Fast integer division by constant: + * Constant : (C) = 100 + * Precision : 0.1% (P) = 0.1 + * Reference : C * 100 / P (R) = 100000 + * + * Thus: + * Shift bits : ceil(log(R,2)) (S) = 17 + * Mult const : round(2^S/C) (M) = 1311 + * + * + */ + margin *= 1311; + margin >>= 17; + + return margin; +} + +#endif /* CONFIG_SCHED_TUNE */ + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 07e22949de7116dfac9098a46109c30776ead625 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 26 Jun 2015 09:55:06 +0100 Subject: [PATCH 0105/3220] sched/fair: add boosted CPU usage The CPU usage signal is used by the scheduler as an estimation of the overall bandwidth currently allocated on a CPU. When SchedDVFS is in use, this signal affects the selection of the operating points (OPP) required to accommodate all the workload allocated in a CPU. A convenient way to boost the performance of tasks running on a CPU, which is also little intrusive, is to boost the CPU usage signal each time it is used to select an OPP. This patch introduces a new function: get_boosted_cpu_usage(cpu) to return a boosted value for the usage of a specified CPU. The margin added to the original usage is: 1. computed based on the "boosting strategy" in use 2. proportional to the system-wide boost value defined by provided user-space interface The boosted signal is used by SchedDVFS (transparently) each time it requires to get an estimation of the capacity required for a CPU. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b6a57837433..eff548fae43e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static inline unsigned long boosted_cpu_util(int cpu); + static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4152,7 +4154,8 @@ static void update_capacity_of(int cpu) return; /* Convert scale-invariant capacity to cpu. 
*/ - req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + req_cap = boosted_cpu_util(cpu); + req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } @@ -5110,8 +5113,36 @@ schedtune_margin(unsigned long signal, unsigned long boost) return margin; } +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + unsigned int boost = get_sysctl_sched_cfs_boost(); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ +static inline unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util(cpu); + unsigned long margin = schedtune_cpu_margin(util); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 13001f47c9a610705219700af4636386b647e231 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 23 Jun 2015 09:17:54 +0100 Subject: [PATCH 0106/3220] sched/tune: add initial support for CGroups based boosting To support task performance boosting, the usage of a single knob has the advantage to be a simple solution, both from the implementation and the usability standpoint. However, on a real system it can be difficult to identify a single value for the knob which fits the needs of multiple different tasks. For example, some kernel threads and/or user-space background services should be better managed the "standard" way while we still want to be able to boost the performance of specific workloads. In order to improve the flexibility of the task boosting mechanism this patch is the first of a small series which extends the previous implementation to introduce a "per task group" support. This first patch introduces just the basic CGroups support, a new "schedtune" CGroups controller is added which allows to configure different boost value for different groups of tasks. To keep the implementation simple but still effective for a boosting strategy, the new controller: 1. allows only a two layer hierarchy 2. supports only a limited number of boost groups A two layer hierarchy allows to place each task either: a) in the root control group thus being subject to a system-wide boosting value b) in a child of the root group thus being subject to the specific boost value defined by that "boost group" The limited number of "boost groups" supported is mainly motivated by the observation that in a real system it could be useful to have only few classes of tasks which deserve different treatment. For example, background vs foreground or interactive vs low-priority. As an additional benefit, a limited number of boost groups allows also to have a simpler implementation especially for the code required to compute the boost value for CPUs which have runnable tasks belonging to different boost groups. 
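As a usage sketch only: with the controller introduced below, a boost group is a child directory of the controller root carrying a "schedtune.boost" attribute (the attribute name follows from the cftype added by this patch, while the mount point used here is an assumption):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	/* One child of the root group: a "class of tasks" to be boosted */
	if (mkdir("/sys/fs/cgroup/schedtune/interactive", 0755) &&
	    errno != EEXIST) {
		perror("mkdir");
		return 1;
	}

	/* Set a 90% boost for every task attached to this group */
	int fd = open("/sys/fs/cgroup/schedtune/interactive/schedtune.boost",
		      O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "90", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}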
cc: Tejun Heo cc: Li Zefan cc: Johannes Weiner cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/cgroup_subsys.h | 4 + init/Kconfig | 17 +++ kernel/sched/tune.c | 223 ++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 4 + 4 files changed, 248 insertions(+) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1a96fdaa33d5..e133705d794a 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -26,6 +26,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE) +SUBSYS(schedtune) +#endif + #if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(io) #endif diff --git a/init/Kconfig b/init/Kconfig index 83d8c02e7a57..5d9097e2b805 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -999,6 +999,23 @@ config CGROUP_CPUACCT Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a cgroup. +config CGROUP_SCHEDTUNE + bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)" + depends on SCHED_TUNE + help + This option provides the "schedtune" controller which improves the + flexibility of the task boosting mechanism by introducing the support + to define "per task" boost values. + + This new controller: + 1. allows only a two layers hierarchy, where the root defines the + system-wide boost value and its direct childrens define each one a + different "class of tasks" to be boosted with a different value + 2. supports up to 16 different task classes, each one which could be + configured with a different boost value + + Say N if unsure. + config PAGE_COUNTER bool diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index a93af9c2f267..95bc8b87c6d4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,230 @@ +#include +#include +#include +#include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; +#ifdef CONFIG_CGROUP_SCHEDTUNE + +/* + * EAS scheduler tunables for task groups. + */ + +/* SchdTune tunables for a group of tasks */ +struct schedtune { + /* SchedTune CGroup subsystem */ + struct cgroup_subsys_state css; + + /* Boost group allocated ID */ + int idx; + + /* Boost value for tasks on that SchedTune CGroup */ + int boost; + +}; + +static inline struct schedtune *css_st(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct schedtune, css) : NULL; +} + +static inline struct schedtune *task_schedtune(struct task_struct *tsk) +{ + return css_st(task_css(tsk, schedtune_cgrp_id)); +} + +static inline struct schedtune *parent_st(struct schedtune *st) +{ + return css_st(st->css.parent); +} + +/* + * SchedTune root control group + * The root control group is used to defined a system-wide boosting tuning, + * which is applied to all tasks in the system. + * Task specific boost tuning could be specified by creating and + * configuring a child control group under the root one. + * By default, system-wide boosting is disabled, i.e. no boosting is applied + * to tasks which are not into a child control group. + */ +static struct schedtune +root_schedtune = { + .boost = 0, +}; + +/* + * Maximum number of boost groups to support + * When per-task boosting is used we still allow only limited number of + * boost groups for two main reasons: + * 1. on a real system we usually have only few classes of workloads which + * make sense to boost with different values (e.g. background vs foreground + * tasks, interactive vs low-priority tasks) + * 2. 
a limited number allows for a simpler and more memory/time efficient + * implementation especially for the computation of the per-CPU boost + * value + */ +#define BOOSTGROUPS_COUNT 4 + +/* Array of configured boostgroups */ +static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { + &root_schedtune, + NULL, +}; + +/* SchedTune boost groups + * Keep track of all the boost groups which impact on CPU, for example when a + * CPU has two RUNNABLE tasks belonging to two different boost groups and thus + * likely with different boost values. + * Since on each system we expect only a limited number of boost groups, here + * we use a simple array to keep track of the metrics required to compute the + * maximum per-CPU boosting value. + */ +struct boost_groups { + /* Maximum boost value for all RUNNABLE tasks on a CPU */ + unsigned boost_max; + struct { + /* The boost for tasks on that boost group */ + unsigned boost; + /* Count of RUNNABLE tasks on that boost group */ + unsigned tasks; + } group[BOOSTGROUPS_COUNT]; +}; + +/* Boost groups affecting each CPU in the system */ +DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); + +static u64 +boost_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->boost; +} + +static int +boost_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 boost) +{ + struct schedtune *st = css_st(css); + + if (boost < 0 || boost > 100) + return -EINVAL; + + st->boost = boost; + if (css == &root_schedtune.css) + sysctl_sched_cfs_boost = boost; + + return 0; +} + +static struct cftype files[] = { + { + .name = "boost", + .read_u64 = boost_read, + .write_u64 = boost_write, + }, + { } /* terminate */ +}; + +static int +schedtune_boostgroup_init(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = st; + + return 0; +} + +static int +schedtune_init(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info(" schedtune configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); + return 0; +} + +static struct cgroup_subsys_state * +schedtune_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct schedtune *st; + int idx; + + if (!parent_css) { + schedtune_init(); + return &root_schedtune.css; + } + + /* Allow only single level hierachies */ + if (parent_css != &root_schedtune.css) { + pr_err("Nested SchedTune boosting groups not allowed\n"); + return ERR_PTR(-ENOMEM); + } + + /* Allow only a limited number of boosting groups */ + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) + if (!allocated_group[idx]) + break; + if (idx == BOOSTGROUPS_COUNT) { + pr_err("Trying to create more than %d SchedTune boosting groups\n", + BOOSTGROUPS_COUNT); + return ERR_PTR(-ENOSPC); + } + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto out; + + /* Initialize per CPUs boost group support */ + st->idx = idx; + if (schedtune_boostgroup_init(st)) + goto release; + + return &st->css; + +release: + kfree(st); +out: + return ERR_PTR(-ENOMEM); +} + +static void +schedtune_boostgroup_release(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = NULL; +} + +static void +schedtune_css_free(struct cgroup_subsys_state *css) +{ + struct schedtune *st = css_st(css); + + schedtune_boostgroup_release(st); + kfree(st); +} + +struct cgroup_subsys 
schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, .legacy_cftypes = files, .early_init = 1, }; + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 827ba256fdad..98c0f6ef01f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -439,7 +439,11 @@ static struct ctl_table kern_table[] = { .procname = "sched_cfs_boost", .data = &sysctl_sched_cfs_boost, .maxlen = sizeof(sysctl_sched_cfs_boost), +#ifdef CONFIG_CGROUP_SCHEDTUNE + .mode = 0444, +#else .mode = 0644, +#endif .proc_handler = &sysctl_sched_cfs_boost_handler, .extra1 = &zero, .extra2 = &one_hundred, From 9cd53fbeb2ad365aac06eebed2baee4984a283b0 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 12:31:35 +0000 Subject: [PATCH 0107/3220] sched/tune: compute and keep track of per CPU boost value When per-task boosting is enabled, we could have multiple RUNNABLE tasks which are concurrently scheduled on the same CPU but each one with a different boost value. For example, we could have a scenario like this: Task SchedTune CGroup Boost Value T1 root 0 T2 low-priority 10 T3 interactive 90 In these conditions we expect a CPU to be configured according to a proper "aggregation" of the required boost values for all the tasks currently scheduled on this CPU. A suitable aggregation function is one which tracks the MAX boost value for all the tasks RUNNABLE on a CPU. This approach makes it possible to always satisfy the most boost-demanding task while at the same time: a) boosting all the concurrently scheduled tasks, thus reducing potential co-scheduling side-effects on demanding tasks b) reducing the number of frequency switches requested of SchedDVFS, thus being more friendly to architectures with slow frequency switching times Every time a task enters/exits the RQ of a CPU the max boost value should be updated considering all the boost groups currently "affecting" that CPU, i.e. which have at least one RUNNABLE task currently allocated on that CPU. This patch introduces the required support to keep track of the boost groups currently affecting CPUs. Thanks to the limited number of boost groups, a small and memory-efficient per-CPU array of boost group values (cpu_boost_groups) is used, which is updated for each CPU entry by schedtune_boostgroup_update(), but only when a schedtune CGroup boost value is updated. However, this is expected to be a rare operation, perhaps done just once at system boot time.
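The aggregation itself is cheap; the following standalone sketch mirrors the per-CPU max tracking described above (BOOSTGROUPS_COUNT and the sample population follow the T1/T2/T3 example, and this is an illustration rather than the patch code):

#include <stdio.h>

#define BOOSTGROUPS_COUNT 4

struct group {
	unsigned int boost;	/* boost value configured for this group */
	unsigned int tasks;	/* RUNNABLE tasks of this group on the CPU */
};

/* Root group (index 0) is always active; other groups count only
 * while they have RUNNABLE tasks on this CPU. */
static unsigned int cpu_boost_max(const struct group *g)
{
	unsigned int max = g[0].boost;
	int idx;

	for (idx = 1; idx < BOOSTGROUPS_COUNT; idx++) {
		if (g[idx].tasks == 0)
			continue;
		if (g[idx].boost > max)
			max = g[idx].boost;
	}
	return max;
}

int main(void)
{
	/* T1 in root (boost 0), T2 in "low-priority" (boost 10, runnable);
	 * "interactive" (boost 90) has no RUNNABLE task, so it is ignored */
	const struct group g[BOOSTGROUPS_COUNT] = {
		{ .boost = 0,  .tasks = 1 },
		{ .boost = 10, .tasks = 1 },
		{ .boost = 90, .tasks = 0 },
		{ 0 },
	};

	printf("boost_max = %u\n", cpu_boost_max(g));	/* prints 10 */
	return 0;
}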
cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 77 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 95bc8b87c6d4..f62386893725 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -97,6 +97,67 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +static void +schedtune_cpu_update(int cpu) +{ + struct boost_groups *bg; + unsigned boost_max; + int idx; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { + /* + * A boost group affects a CPU only if it has + * RUNNABLE tasks on that CPU + */ + if (bg->group[idx].tasks == 0) + continue; + boost_max = max(boost_max, bg->group[idx].boost); + } + + bg->boost_max = boost_max; +} + +static int +schedtune_boostgroup_update(int idx, int boost) +{ + struct boost_groups *bg; + int cur_boost_max; + int old_boost; + int cpu; + + /* Update per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + + /* + * Keep track of current boost values to compute the per CPU + * maximum only when it has been affected by the new value of + * the updated boost group + */ + cur_boost_max = bg->boost_max; + old_boost = bg->group[idx].boost; + + /* Update the boost value of this boost group */ + bg->group[idx].boost = boost; + + /* Check if this update increase current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { + bg->boost_max = boost; + continue; + } + + /* Check if this update has decreased current max */ + if (cur_boost_max == old_boost && old_boost > boost) + schedtune_cpu_update(cpu); + } + + return 0; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -118,6 +179,9 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, if (css == &root_schedtune.css) sysctl_sched_cfs_boost = boost; + /* Update CPU boost */ + schedtune_boostgroup_update(st->idx, st->boost); + return 0; } @@ -133,9 +197,19 @@ static struct cftype files[] = { static int schedtune_boostgroup_init(struct schedtune *st) { + struct boost_groups *bg; + int cpu; + /* Keep track of allocated boost groups */ allocated_group[st->idx] = st; + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[st->idx].boost = 0; + bg->group[st->idx].tasks = 0; + } + return 0; } @@ -203,6 +277,9 @@ out: static void schedtune_boostgroup_release(struct schedtune *st) { + /* Reset this boost group */ + schedtune_boostgroup_update(st->idx, 0); + /* Keep track of allocated boost groups */ allocated_group[st->idx] = NULL; } From a515b887ece159dfc45010667c43bf4b834ee325 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 7 Jul 2015 15:33:20 +0100 Subject: [PATCH 0108/3220] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value When per-task boosting is enabled, every time a task enters/exits a CPU its boost value could impact the currently selected OPP for that CPU. Thus, the "aggregated" boost value for that CPU potentially needs to be updated to match the current maximum boost value among all the tasks currently RUNNABLE on that CPU. This patch introduces the required support to keep track of which boost groups are impacting a CPU. 
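For example, building on the T1/T2/T3 scenario above: when T3 (interactive, boost 90) wakes up on a CPU whose current boost_max is 10, the interactive group's RUNNABLE counter goes from 0 to 1, that group starts to affect the CPU, and the aggregation raises boost_max to 90; when T3 is dequeued the counter returns to 0 and boost_max falls back to 10. The bookkeeping works as follows.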
Each time a task is enqueued/dequeued to/from a CPU its boost group is used to increment a per-cpu counter of RUNNABLE tasks on that CPU. Only when the number of runnable tasks for a specific boost group becomes 1 or 0 the corresponding boost group changes its effects on that CPU, specifically: a) boost_group::tasks == 1: this boost group starts to impact the CPU b) boost_group::tasks == 0: this boost group stops to impact the CPU In each of these two conditions the aggregation function: sched_cpu_update(cpu) could be required to run in order to identify the new maximum boost value required for the CPU. The proposed patch minimizes the number of times the aggregation function is executed while still providing the required support to always boost a CPU to the maximum boost value required by all its currently RUNNABLE tasks. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 17 +++++++--- kernel/sched/tune.c | 82 +++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 23 +++++++++++++ 3 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/tune.h diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eff548fae43e..2dbe1ff0a90b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -34,6 +34,7 @@ #include #include "sched.h" +#include "tune.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -4210,6 +4211,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + schedtune_enqueue_task(p, cpu_of(rq)); + /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4279,6 +4282,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { sub_nr_running(rq, 1); + schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -5114,10 +5118,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) } static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost = get_sysctl_sched_cfs_boost(); + unsigned int boost; +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_cpu_boost(cpu); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif if (boost == 0) return 0; @@ -5127,7 +5136,7 @@ schedtune_cpu_margin(unsigned long util) #else /* CONFIG_SCHED_TUNE */ static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } @@ -5138,7 +5147,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util); + unsigned long margin = schedtune_cpu_margin(util, cpu); return util + margin; } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f62386893725..540b945a01ce 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "sched.h" @@ -158,6 +159,87 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg; + int tasks; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* Update boosted tasks count while avoiding to make it negative */ + if (task_count < 0 && bg->group[idx].tasks <= -task_count) + bg->group[idx].tasks = 0; + else + bg->group[idx].tasks += task_count; + + /* Boost group 
activation or deactivation on that RQ */ + tasks = bg->group[idx].tasks; + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, 1); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_dequeue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + * The last dequeue will be done by cgroup exit() callback. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, -1); +} + +int schedtune_cpu_boost(int cpu) +{ + struct boost_groups *bg; + + bg = &per_cpu(cpu_boost_groups, cpu); + return bg->boost_max; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h new file mode 100644 index 000000000000..561b5171a19b --- /dev/null +++ b/kernel/sched/tune.h @@ -0,0 +1,23 @@ + +#ifdef CONFIG_SCHED_TUNE + +#ifdef CONFIG_CGROUP_SCHEDTUNE + +int schedtune_cpu_boost(int cpu); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +#else /* CONFIG_SCHED_TUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_SCHED_TUNE */ From a8f65584fc948d646732c0f9533570f97091c78d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:31:53 +0000 Subject: [PATCH 0109/3220] sched/fair: add boosted task utilization The task utilization signal, which is derived from PELT signals and properly scaled to be architecture and frequency invariant, is used by EAS as an estimation of the task requirements in terms of CPU bandwidth. When the energy aware scheduler is in use, this signal affects the CPU selection. Thus, a convenient way to bias that decision, which is also little intrusive, is to boost the task utilization signal each time it is required to support them. This patch introduces the new function: boosted_task_util(task) which returns a boosted value for the utilization of the specified task. The margin added to the original utilization is: 1. computed based on the "boosting strategy" in use 2. proportional to boost value defined either by the sysctl interface, when global boosting is in use, or the "taskgroup" value, when per-task boosting is enabled. 
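As a worked example, assuming SCHED_LOAD_SCALE = 1024: a task with a utilization of 256 whose boost group is configured at 50% gets a margin of 0.5 * (1024 - 256) = 384, so boosted_task_util() returns 256 + 384 = 640.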
The boosted signal is used by EAS a. transparently, via its integration into the task_fits() function b. explicitly, in the energy-aware wakeup path Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++-- kernel/sched/tune.c | 14 ++++++++++++++ kernel/sched/tune.h | 1 + 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2dbe1ff0a90b..f582d58daea5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5048,11 +5048,13 @@ static inline unsigned long task_util(struct task_struct *p) unsigned int capacity_margin = 1280; /* ~20% margin */ +static inline unsigned long boosted_task_util(struct task_struct *task); + static inline bool __task_fits(struct task_struct *p, int cpu, int util) { unsigned long capacity = capacity_of(cpu); - util += task_util(p); + util += boosted_task_util(p); return (capacity * 1024) > (util * capacity_margin); } @@ -5133,6 +5135,27 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } +static inline unsigned long +schedtune_task_margin(struct task_struct *task) +{ + unsigned int boost; + unsigned long util; + unsigned long margin; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return 0; + + util = task_util(task); + margin = schedtune_margin(util, boost); + + return margin; +} + #else /* CONFIG_SCHED_TUNE */ static inline unsigned int @@ -5141,6 +5164,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return 0; } +static inline unsigned int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ static inline unsigned long @@ -5152,6 +5181,15 @@ boosted_cpu_util(int cpu) return util + margin; } +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util(task); + unsigned long margin = schedtune_task_margin(task); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5386,7 +5424,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. 
*/ - int new_util = cpu_util(i) + task_util(p); + int new_util = cpu_util(i) + boosted_task_util(p); if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 540b945a01ce..87213861bde5 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -240,6 +240,20 @@ int schedtune_cpu_boost(int cpu) return bg->boost_max; } +int schedtune_task_boost(struct task_struct *p) +{ + struct schedtune *st; + int task_boost; + + /* Get task boost value */ + rcu_read_lock(); + st = task_schedtune(p); + task_boost = st->boost; + rcu_read_unlock(); + + return task_boost; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 561b5171a19b..d756ce7b06e0 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -4,6 +4,7 @@ #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); +int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); From 36967b2412fa7f356308c5104d95574e84ca03ef Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:35:13 +0000 Subject: [PATCH 0110/3220] sched/fair: keep track of energy/capacity variations The current EAS implementation does not allow boosting task performance, for example by running tasks at a higher OPP (or on a more capable CPU), even if that could require a "reasonable" increase in energy consumption. To define how reasonable an energy increase is with respect to a required boost value, it is necessary to define and compute a trade-off between the expected energy and performance variations. However, the current EAS implementation considers only energy variations while completely disregarding the impact on performance for the selection of a certain schedule candidate. This patch extends the eenv energy environment to keep track of both the energy and performance deltas which are implied by the activation of a schedule candidate. The performance variation is estimated considering the different capacities of the CPUs on which the task could be scheduled. The idea is that while running on a CPU with higher capacity (e.g. a higher operating point) the task could (potentially) complete faster and thus get better performance.
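As a worked example with made-up cap_states values: moving a task from a little CPU running at a capacity of 430 to a big CPU that would run it at a capacity of 1024 yields cap.before = 430, cap.after = 1024 and cap.delta = +594, i.e. an estimated performance gain; the energy side of the trade-off is computed as before.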
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f582d58daea5..3b80ad01aa8b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4706,6 +4706,16 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + struct { + int before; + int after; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; }; /* @@ -4872,6 +4882,22 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->util_delta == 0 && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->util_delta != 0 && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) @@ -4920,6 +4946,8 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .nrg = { 0, 0, 0 }, + .cap = { 0, 0, 0 }, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -4941,13 +4969,21 @@ static int energy_diff(struct energy_env *eenv) return 0; /* Invalid result abort */ energy_before += eenv_before.energy; + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + if (sched_group_energy(eenv)) return 0; /* Invalid result abort */ energy_after += eenv->energy; } } while (sg = sg->next, sg != sd->groups); - return energy_after-energy_before; + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + + return eenv->nrg.diff; } /* From 637ee3715a1a22c07995be181c0e2fca10d2df2a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 12 Jan 2016 18:12:13 +0000 Subject: [PATCH 0111/3220] sched/tune: add support to compute normalized energy The current EAS implementation considers only energy variations, while it completely disregards the impact on performance for the selection of a certain schedule candidate. Moreover, it also makes its decision based on the "absolute" value of expected energy variations. In order to properly define a trade-off strategy between increased energy consumption and performance benefits it is necessary to compare energy variations with performance variations. Thus, both performance and energy metrics must be expressed in comparable units. While the performance variations are expressed in terms of capacity deltas, which are defined in the range [0..SCHED_LOAD_SCALE], the same scale is not used for energy variations. This patch introduces the function: schedtune_normalize_energy(energy_diff) which returns a normalized value in the same range as capacity variations, i.e. [0..SCHED_LOAD_SCALE]. A proper set of energy normalization constants is required to provide a fast division by a constant during the normalization of the energy_diff. The value of these constants depends on the specific energy model and topology of a target device.
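A minimal sketch of the normalization step, using plain integer division in place of the kernel's reciprocal_divide() and made-up power numbers; SCHED_LOAD_SHIFT = 10 (i.e. SCHED_LOAD_SCALE = 1024) is an assumption:

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10	/* assumption: SCHED_LOAD_SCALE == 1024 */

/* Plain-division stand-in for the reciprocal_divide() based kernel code */
static int normalize_energy(int energy_diff, unsigned long min_power,
			    unsigned long max_power)
{
	unsigned long range = max_power - min_power;
	unsigned long nrg = (energy_diff < 0) ? -energy_diff : energy_diff;

	nrg <<= SCHED_LOAD_SHIFT;	/* scale by energy magnitude */
	nrg /= range;			/* normalize on the platform range */

	return (energy_diff < 0) ? -(int)nrg : (int)nrg;
}

int main(void)
{
	/* made-up platform: system-wide power spans [120..5120] */
	printf("%d\n", normalize_energy(1250, 120, 5120));	/* prints 256 */
	return 0;
}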
Thus, this patch provides also the required support for the computation at boot time of this set of variables. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 321 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 7 + 2 files changed, 328 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 87213861bde5..1a8ba5a6d99b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,9 @@ #include #include +#include #include #include +#include #include #include @@ -9,6 +11,84 @@ unsigned int sysctl_sched_cfs_boost __read_mostly; +/* + * System energy normalization constants + */ +static struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +} schedtune_target_nrg; + +/* Performance Boost region (B) threshold params */ +static int perf_boost_idx; + +/* Performance Constraint region (C) threshold params */ +static int perf_constrain_idx; + +/** + * Performance-Energy (P-E) Space thresholds constants + */ +struct threshold_params { + int nrg_gain; + int cap_gain; +}; + +/* + * System specific P-E space thresholds constants + */ +static struct threshold_params +threshold_gains[] = { + { 0, 4 }, /* >= 0% */ + { 0, 4 }, /* >= 10% */ + { 1, 4 }, /* >= 20% */ + { 2, 4 }, /* >= 30% */ + { 3, 4 }, /* >= 40% */ + { 4, 3 }, /* >= 50% */ + { 4, 2 }, /* >= 60% */ + { 4, 1 }, /* >= 70% */ + { 4, 0 }, /* >= 80% */ + { 4, 0 } /* >= 90% */ +}; + +static int +__schedtune_accept_deltas(int nrg_delta, int cap_delta, + int perf_boost_idx, int perf_constrain_idx) +{ + int payoff = -INT_MAX; + + /* Performance Boost (B) region */ + if (nrg_delta > 0 && cap_delta > 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta < cap_gain / nrg_gain + * which is: + * nrg_delta * cap_gain > cap_delta * nrg_gain + */ + payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; + payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + return payoff; + } + + /* Performance Constraint (C) region */ + if (nrg_delta < 0 && cap_delta < 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta > cap_gain / nrg_gain + * which is: + * cap_delta * nrg_gain > nrg_delta * cap_gain + */ + payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + return payoff; + } + + /* Default: reject schedule candidate */ + return payoff; +} + #ifdef CONFIG_CGROUP_SCHEDTUNE /* @@ -26,6 +106,11 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; + /* Performance Boost (B) region threshold params */ + int perf_boost_idx; + + /* Performance Constraint (C) region threshold params */ + int perf_constrain_idx; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -55,8 +140,37 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, + .perf_boost_idx = 0, + .perf_constrain_idx = 0, }; +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + struct schedtune *ct; + int perf_boost_idx; + int perf_constrain_idx; + + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + /* Get task specific perf Boost/Constraints indexes */ + rcu_read_lock(); + ct = task_schedtune(task); + perf_boost_idx 
= ct->perf_boost_idx; + perf_constrain_idx = ct->perf_constrain_idx; + rcu_read_unlock(); + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + /* * Maximum number of boost groups to support * When per-task boosting is used we still allow only limited number of @@ -396,6 +510,24 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + #endif /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -408,5 +540,194 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* Performance Boost (B) region threshold params */ + perf_boost_idx = sysctl_sched_cfs_boost; + perf_boost_idx /= 10; + + /* Performance Constraint (C) region threshold params */ + perf_constrain_idx = 100 - sysctl_sched_cfs_boost; + perf_constrain_idx /= 10; + return 0; } + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +int +schedtune_normalize_energy(int energy_diff) +{ + u32 normalized_nrg; + int max_delta; + +#ifdef CONFIG_SCHED_DEBUG + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? 
-normalized_nrg : normalized_nrg; +} + +#ifdef CONFIG_SCHED_DEBUG +static void +schedtune_test_nrg(unsigned long delta_pwr) +{ + unsigned long test_delta_pwr; + unsigned long test_norm_pwr; + int idx; + + /* + * Check normalization constants using some constant system + * energy values + */ + pr_info("schedtune: verify normalization constants...\n"); + for (idx = 0; idx < 6; ++idx) { + test_delta_pwr = delta_pwr >> idx; + + /* Normalize on max energy for target platform */ + test_norm_pwr = reciprocal_divide( + test_delta_pwr << SCHED_LOAD_SHIFT, + schedtune_target_nrg.rdiv); + + pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n", + idx, test_delta_pwr, test_norm_pwr); + } +} +#else +#define schedtune_test_nrg(delta_pwr) +#endif + +/* + * Compute the min/max power consumption of a cluster and all its CPUs + */ +static void +schedtune_add_cluster_nrg( + struct sched_domain *sd, + struct sched_group *sg, + struct target_nrg *ste) +{ + struct sched_domain *sd2; + struct sched_group *sg2; + + struct cpumask *cluster_cpus; + char str[32]; + + unsigned long min_pwr; + unsigned long max_pwr; + int cpu; + + /* Get Cluster energy using EM data for the first CPU */ + cluster_cpus = sched_group_cpus(sg); + snprintf(str, 32, "CLUSTER[%*pbl]", + cpumask_pr_args(cluster_cpus)); + + min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power; + max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power; + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Keep track of this cluster's energy in the computation of the + * overall system energy + */ + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + /* Get CPU energy using EM data for each CPU in the group */ + for_each_cpu(cpu, cluster_cpus) { + /* Get a SD view for the specific CPU */ + for_each_domain(cpu, sd2) { + /* Get the CPU group */ + sg2 = sd2->groups; + min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power; + max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power; + + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + snprintf(str, 32, "CPU[%d]", cpu); + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Assume we have EM data only at the CPU and + * the upper CLUSTER level + */ + BUG_ON(!cpumask_equal( + sched_group_cpus(sg), + sched_group_cpus(sd2->parent->groups) + )); + break; + } + } +} + +/* + * Initialize the constants required to compute normalized energy. + * The values of these constants depends on the EM data for the specific + * target system and topology. + * Thus, this function is expected to be called by the code + * that bind the EM to the topology information. + */ +static int +schedtune_init_late(void) +{ + struct target_nrg *ste = &schedtune_target_nrg; + unsigned long delta_pwr = 0; + struct sched_domain *sd; + struct sched_group *sg; + + pr_info("schedtune: init normalization constants...\n"); + ste->max_power = 0; + ste->min_power = 0; + + rcu_read_lock(); + + /* + * When EAS is in use, we always have a pointer to the highest SD + * which provides EM data. 
+ */ + sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); + if (!sd) { + pr_info("schedtune: no energy model data\n"); + goto nodata; + } + + sg = sd->groups; + do { + schedtune_add_cluster_nrg(sd, sg, ste); + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + "SYSTEM", ste->min_power, ste->max_power); + + /* Compute normalization constants */ + delta_pwr = ste->max_power - ste->min_power; + ste->rdiv = reciprocal_value(delta_pwr); + pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n", + ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); + + schedtune_test_nrg(delta_pwr); + return 0; + +nodata: + rcu_read_unlock(); + return -EINVAL; +} +late_initcall(schedtune_init_late); diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index d756ce7b06e0..f7273a5d994a 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -16,9 +16,16 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #endif /* CONFIG_CGROUP_SCHEDTUNE */ +int schedtune_normalize_energy(int energy); +int schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task); + #else /* CONFIG_SCHED_TUNE */ #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) +#define schedtune_normalize_energy(energy) energy +#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta + #endif /* CONFIG_SCHED_TUNE */ From a2a6dc750833243b3b49d016291ae1133196759a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 15 Jan 2016 15:48:03 +0000 Subject: [PATCH 0112/3220] sched/fair: filter energy_diff() based on energy_payoff value Once the SchedTune support is enabled and the CPU bandwidth demand of a task is boosted, we can expect increased energy consumption which is balanced by a corresponding increase in task performance. However, the current implementation of the energy_diff() function accepts all and _only_ the schedule candidates which result in a reduced expected system energy, which works against the boosting strategy. This patch links the energy_diff() function with the "energy payoff" engine provided by SchedTune. The energy variation computed by the energy_diff() function is now filtered using the SchedTune support to evaluate the energy payoff for a boosted task. With this patch, the energy_diff() function reports as an "acceptable schedule candidate" only a schedule candidate which corresponds to a positive energy_payoff.
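As a worked example following the code above (the numbers are illustrative): with a 50% boost, perf_boost_idx is 50/10 = 5 and threshold_gains[5] = {4, 3} (nrg_gain, cap_gain); a boosted candidate with nrg_delta = 100 and cap_delta = 50 yields payoff = 100 * 3 - 50 * 4 = 100, and energy_diff() then returns -payoff = -100, i.e. an acceptable schedule candidate.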
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b80ad01aa8b..1191c4b1b119 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4706,9 +4706,12 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + int payoff; + struct task_struct *task; struct { int before; int after; + int delta; int diff; } nrg; struct { @@ -4929,6 +4932,44 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +#ifdef CONFIG_SCHED_TUNE +static int energy_diff_evaluate(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff_evaluate(eenv) eenv->nrg.diff +#endif + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. 
eenv specifies the change: utilisation amount, source, and @@ -4946,7 +4987,7 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, - .nrg = { 0, 0, 0 }, + .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, }; @@ -4982,8 +5023,9 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.before = energy_before; eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; - return eenv->nrg.diff; + return energy_diff_evaluate(eenv); } /* @@ -5481,6 +5523,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) .util_delta = task_util(p), .src_cpu = task_cpu(p), .dst_cpu = target_cpu, + .task = p, }; /* Not enough spare capacity on previous cpu */ From 99ed4e57cb4231b2561ec4a6433722dcd4a19a9e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:06:24 +0000 Subject: [PATCH 0113/3220] DEBUG: sched: add tracepoint for cpu/freq scale invariance Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 24 ++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9b90c57517a9..20a7216f8abb 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -562,6 +562,30 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +TRACE_EVENT(sched_contrib_scale_f, + + TP_PROTO(int cpu, unsigned long freq_scale_factor, + unsigned long cpu_scale_factor), + + TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, freq_scale_factor) + __field(unsigned long, cpu_scale_factor) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->freq_scale_factor = freq_scale_factor; + __entry->cpu_scale_factor = cpu_scale_factor; + ), + + TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu", + __entry->cpu, __entry->freq_scale_factor, + __entry->cpu_scale_factor) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1191c4b1b119..ec2e8aecc4f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2587,6 +2587,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; From 8017fd7418dc34d55d0e8bfc83e6c0ec10909c58 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:27 +0000 Subject: [PATCH 0114/3220] DEBUG: sched: add tracepoint for task load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 43 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 3 +++ 2 files changed, 46 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 20a7216f8abb..99f3e648c197 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -586,6 +586,49 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu, __entry->freq_scale_factor, __entry->cpu_scale_factor) ); + +/* + * Tracepoint for accounting sched averages for tasks. 
+ */ +TRACE_EVENT(sched_load_avg_task, + + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + + TP_ARGS(tsk, avg), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + __field( u64, load_sum ) + __field( u32, util_sum ) + __field( u32, period_contrib ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->load_avg = avg->load_avg; + __entry->util_avg = avg->util_avg; + __entry->load_sum = avg->load_sum; + __entry->util_sum = avg->util_sum; + __entry->period_contrib = avg->period_contrib; + ), + + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu" + " util_sum=%u period_contrib=%u", + __entry->comm, + __entry->pid, + __entry->cpu, + __entry->load_avg, + __entry->util_avg, + (u64)__entry->load_sum, + (u32)__entry->util_sum, + (u32)__entry->period_contrib) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ec2e8aecc4f3..8cdb4e5592f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2731,6 +2731,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) + trace_sched_load_avg_task(task_of(se), &se->avg); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From e5a25996d3a3f46f3abee77f868b922903f5f60e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:48 +0000 Subject: [PATCH 0115/3220] DEBUG: sched: add tracepoint for CPU load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 25 +++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 99f3e648c197..f58760c2f712 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -629,6 +629,31 @@ TRACE_EVENT(sched_load_avg_task, (u32)__entry->util_sum, (u32)__entry->period_contrib) ); + +/* + * Tracepoint for accounting sched averages for cpus. 
+ */ +TRACE_EVENT(sched_load_avg_cpu, + + TP_PROTO(int cpu, struct cfs_rq *cfs_rq), + + TP_ARGS(cpu, cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->load_avg = cfs_rq->avg.load_avg; + __entry->util_avg = cfs_rq->avg.util_avg; + ), + + TP_printk("cpu=%d load_avg=%lu util_avg=%lu", + __entry->cpu, __entry->load_avg, __entry->util_avg) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cdb4e5592f9..b5ea9ec335d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2734,6 +2734,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); + trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From dd2460f3872f1b89839aedb21b999e802542c32b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 17:35:23 +0100 Subject: [PATCH 0116/3220] DEBUG: sched,cpufreq: add cpu_capacity change tracepoint This is useful when we want to compare cpu utilization and cpu curr capacity side by side. Signed-off-by: Juri Lelli --- drivers/cpufreq/cpufreq.c | 4 ++++ include/linux/sched.h | 2 ++ include/trace/events/power.h | 7 +++++++ kernel/sched/fair.c | 11 +++++++++++ kernel/sched/sched.h | 11 ----------- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5ca8e9517aae..0f30d11fb528 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include static LIST_HEAD(cpufreq_policy_list); @@ -473,6 +474,7 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { + int cpu; /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -501,6 +503,8 @@ wait: spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); + for_each_cpu(cpu, policy->cpus) + trace_cpu_capacity(capacity_curr_of(cpu), cpu); cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 1460c6a4f65f..b9b11b2dbabd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,6 +1046,8 @@ struct sched_group_energy { struct capacity_state *cap_states; /* ptr to capacity state array */ }; +unsigned long capacity_curr_of(int cpu); + struct sched_group; struct sched_domain { diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244ebfe8d..f4be04e44252 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -120,6 +120,13 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); +DEFINE_EVENT(cpu, cpu_capacity, + + TP_PROTO(unsigned int capacity, unsigned int cpu_id), + + TP_ARGS(capacity, cpu_id) +); + TRACE_EVENT(device_pm_callback_start, TP_PROTO(struct device *dev, const char *pm_ops, int event), diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5ea9ec335d8..9b74b86aba3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4698,6 +4698,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. 
+ */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 983c132af11b..db3f3db9849c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1512,17 +1512,6 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static inline unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From 0a0f4aa12f6ec50e8b5d917b773e29702714acad Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:25:50 +0000 Subject: [PATCH 0117/3220] DEBUG: sched: add energy procfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the energy data available via procfs. The related files are placed as sub-directory named 'energy' inside the /proc/sys/kernel/sched_domain/cpuX/domainY/groupZ directory for those cpu/domain/group tuples which have energy information. The following example depicts the contents of /proc/sys/kernel/sched_domain/cpu0/domain0/group[01] for a system which has energy information attached to domain level 0. ├── cpu0 │ ├── domain0 │ │ ├── busy_factor │ │ ├── busy_idx │ │ ├── cache_nice_tries │ │ ├── flags │ │ ├── forkexec_idx │ │ ├── group0 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── group1 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── idle_idx │ │ ├── imbalance_pct │ │ ├── max_interval │ │ ├── max_newidle_lb_cost │ │ ├── min_interval │ │ ├── name │ │ ├── newidle_idx │ │ └── wake_idx │ └── domain1 │ ├── busy_factor │ ├── busy_idx │ ├── cache_nice_tries │ ├── flags │ ├── forkexec_idx │ ├── idle_idx │ ├── imbalance_pct │ ├── max_interval │ ├── max_newidle_lb_cost │ ├── min_interval │ ├── name │ ├── newidle_idx │ └── wake_idx The files 'nr_idle_states' and 'nr_cap_states' contain a scalar value whereas 'idle_states' and 'cap_states' contain a vector of power consumption at this idle state respectively (compute capacity, power consumption) at this capacity state. 
Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 151ed9c62ad2..9e25e68747d7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5412,10 +5412,61 @@ set_table_entry(struct ctl_table *entry, } } +static struct ctl_table * +sd_alloc_ctl_energy_table(struct sched_group_energy *sge) +{ + struct ctl_table *table = sd_alloc_ctl_entry(5); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power, + sge->nr_idle_states*sizeof(struct idle_state), 0644, + proc_doulongvec_minmax, false); + set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap, + sge->nr_cap_states*sizeof(struct capacity_state), 0644, + proc_doulongvec_minmax, false); + + return table; +} + +static struct ctl_table * +sd_alloc_ctl_group_table(struct sched_group *sg) +{ + struct ctl_table *table = sd_alloc_ctl_entry(2); + + if (table == NULL) + return NULL; + + table->procname = kstrdup("energy", GFP_KERNEL); + table->mode = 0555; + table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge); + + return table; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table; + unsigned int nr_entries = 14; + + int i = 0; + struct sched_group *sg = sd->groups; + + if (sg->sge) { + int nr_sgs = 0; + + do {} while (nr_sgs++, sg = sg->next, sg != sd->groups); + + nr_entries += nr_sgs; + } + + table = sd_alloc_ctl_entry(nr_entries); if (table == NULL) return NULL; @@ -5448,7 +5499,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + sg = sd->groups; + if (sg->sge) { + char buf[32]; + struct ctl_table *entry = &table[13]; + + do { + snprintf(buf, 32, "group%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_group_table(sg); + } while (entry++, i++, sg = sg->next, sg != sd->groups); + } + /* &table[nr_entries-1] is terminator */ return table; } From c8a65d2e9a3a6ba9d4d7b1634701dd34017f4107 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:49:07 +0100 Subject: [PATCH 0118/3220] DEBUG: schedtune: add tracepoint for SchedTune configuration update Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 21 +++++++++++++++++++++ kernel/sched/tune.c | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f58760c2f712..8ae43a2ebfda 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -654,6 +654,27 @@ TRACE_EVENT(sched_load_avg_cpu, TP_printk("cpu=%d load_avg=%lu util_avg=%lu", __entry->cpu, __entry->load_avg, __entry->util_avg) ); + +/* + * Tracepoint for sched_tune_config settings + */ +TRACE_EVENT(sched_tune_config, + + TP_PROTO(int boost), + + TP_ARGS(boost), + + TP_STRUCT__entry( + __field( int, boost ) + ), + + TP_fast_assign( + __entry->boost = boost; + ), + + TP_printk("boost=%d ", __entry->boost) 
+); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 1a8ba5a6d99b..f5f4c57efb9e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -7,6 +7,8 @@ #include #include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; @@ -392,6 +394,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); + trace_sched_tune_config(st->boost); + return 0; } From a09a25c5dfc81d08df7b3c2e76ea94a651c17ac2 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:51:07 +0100 Subject: [PATCH 0119/3220] DEBUG: schedtune: add tracepoint for CPU boost signal Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 29 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8ae43a2ebfda..eb812071458b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -675,6 +675,33 @@ TRACE_EVENT(sched_tune_config, TP_printk("boost=%d ", __entry->boost) ); +/* + * Tracepoint for accounting CPU boosted utilization + */ +TRACE_EVENT(sched_boost_cpu, + + TP_PROTO(int cpu, unsigned long util, unsigned long margin), + + TP_ARGS(cpu, util, margin), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("cpu=%d util=%lu margin=%lu", + __entry->cpu, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b74b86aba3a..3bc56e0ea1ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5272,6 +5272,8 @@ boosted_cpu_util(int cpu) unsigned long util = cpu_util(cpu); unsigned long margin = schedtune_cpu_margin(util, cpu); + trace_sched_boost_cpu(cpu, util, margin); + return util + margin; } From fed95d9ef1b2674c4a2cda8b2445ca7ef3f6ca59 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Jun 2015 15:36:08 +0100 Subject: [PATCH 0120/3220] DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 62 ++++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 12 ++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index eb812071458b..7ec9dcfc701a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -702,6 +702,68 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_tasks_update, + + TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, + unsigned int boost, unsigned int max_boost), + + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, tasks ) + __field( int, idx ) + __field( unsigned int, boost ) + __field( unsigned int, max_boost ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = cpu; + __entry->tasks = tasks; + __entry->idx = idx; + __entry->boost = boost; + __entry->max_boost = max_boost; + ), + + TP_printk("pid=%d comm=%s " + 
"cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + __entry->pid, __entry->comm, + __entry->cpu, __entry->tasks, __entry->idx, + __entry->boost, __entry->max_boost) +); + +/* + * Tracepoint for schedtune_boostgroup_update + */ +TRACE_EVENT(sched_tune_boostgroup_update, + + TP_PROTO(int cpu, int variation, int max_boost), + + TP_ARGS(cpu, variation, max_boost), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( int, variation ) + __field( int, max_boost ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->variation = variation; + __entry->max_boost = max_boost; + ), + + TP_printk("cpu=%d variation=%d max_boost=%d", + __entry->cpu, __entry->variation, __entry->max_boost) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f5f4c57efb9e..7a434f2394e7 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -264,12 +264,18 @@ schedtune_boostgroup_update(int idx, int boost) /* Check if this update increase current max */ if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ - if (cur_boost_max == old_boost && old_boost > boost) + if (cur_boost_max == old_boost && old_boost > boost) { schedtune_cpu_update(cpu); + trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); + continue; + } + + trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max); } return 0; @@ -293,6 +299,10 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) tasks = bg->group[idx].tasks; if (tasks == 1 || tasks == 0) schedtune_cpu_update(cpu); + + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max); + } /* From e3b53389cf2313172034123a84d760984c4efd71 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Fri, 6 May 2016 00:17:57 -0700 Subject: [PATCH 0121/3220] Revert "drivers: power: Add watchdog timer to catch drivers which lockup during suspend." This reverts commit ad86cc8ad63229eeeba0628e99f2f59df55a25fd. Commit 70fea60d888d ("PM / Sleep: Detect device suspend/resume lockup...") added a suspend/resume watchdog timer to catch the lockup. Let's revert the duplicate one. Change-Id: Ic72a87432e27844155467817600adc6cf0c2209c Signed-off-by: Lianwei Wang Signed-off-by: Amit Pundir --- drivers/base/power/main.c | 43 --------------------------------------- 1 file changed, 43 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 6ed8b9326629..2f7a6e1b568c 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -62,12 +62,6 @@ struct suspend_stats suspend_stats; static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; -static void dpm_drv_timeout(unsigned long data); -struct dpm_drv_wd_data { - struct device *dev; - struct task_struct *tsk; -}; - static int async_error; static char *pm_verb(int event) @@ -838,30 +832,6 @@ static void async_resume(void *data, async_cookie_t cookie) put_device(dev); } -/** - * dpm_drv_timeout - Driver suspend / resume watchdog handler - * @data: struct device which timed out - * - * Called when a driver has timed out suspending or resuming. 
- * There's not much we can do here to recover so - * BUG() out for a crash-dump - * - */ -static void dpm_drv_timeout(unsigned long data) -{ - struct dpm_drv_wd_data *wd_data = (void *)data; - struct device *dev = wd_data->dev; - struct task_struct *tsk = wd_data->tsk; - - printk(KERN_EMERG "**** DPM device timeout: %s (%s)\n", dev_name(dev), - (dev->driver ? dev->driver->name : "no driver")); - - printk(KERN_EMERG "dpm suspend stack:\n"); - show_stack(tsk, NULL); - - BUG(); -} - /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. @@ -1380,8 +1350,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) pm_callback_t callback = NULL; char *info = NULL; int error = 0; - struct timer_list timer; - struct dpm_drv_wd_data data; char suspend_abort[MAX_SUSPEND_ABORT_LEN]; DECLARE_DPM_WATCHDOG_ON_STACK(wd); @@ -1412,14 +1380,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (dev->power.syscore) goto Complete; - - data.dev = dev; - data.tsk = current; - init_timer_on_stack(&timer); - timer.expires = jiffies + HZ * 12; - timer.function = dpm_drv_timeout; - timer.data = (unsigned long)&data; - add_timer(&timer); if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { @@ -1500,9 +1460,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) device_unlock(dev); dpm_watchdog_clear(&wd); - del_timer_sync(&timer); - destroy_timer_on_stack(&timer); - Complete: complete_all(&dev->power.completion); if (error) From e9253e876087accdab8edb763cfa4c3a48a8ce66 Mon Sep 17 00:00:00 2001 From: Jimmy Perchet Date: Mon, 9 May 2016 10:32:04 -0700 Subject: [PATCH 0122/3220] FROMLIST: wlcore: Disable filtering in AP role When you configure (set up) a STA interface, the driver installs a multicast filter. This is normal behavior: when an application subscribes to a multicast address, the filter is updated. When an Access Point interface is configured, there is no filter installation and the "filter update" path is disabled in the driver. The problem happens when you switch an interface from STA type to AP type. The filter is installed but there are no means to update it. Change-Id: Ied22323af831575303abd548574918baa9852dd0 Signed-off-by: Dmitry Shmidt --- drivers/net/wireless/ti/wlcore/init.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/wireless/ti/wlcore/init.c b/drivers/net/wireless/ti/wlcore/init.c index e92f2639af2c..9fd3c6af0a61 100644 --- a/drivers/net/wireless/ti/wlcore/init.c +++ b/drivers/net/wireless/ti/wlcore/init.c @@ -549,6 +549,11 @@ static int wl12xx_init_ap_role(struct wl1271 *wl, struct wl12xx_vif *wlvif) { int ret; + /* Disable filtering */ + ret = wl1271_acx_group_address_tbl(wl, wlvif, false, NULL, 0); + if (ret < 0) + return ret; + ret = wl1271_acx_ap_max_tx_retry(wl, wlvif); if (ret < 0) return ret; From 92c4fc6f0946e846dee50b79b5dd061fe795dfaf Mon Sep 17 00:00:00 2001 From: Janis Danisevskis Date: Thu, 14 Apr 2016 13:57:03 +0100 Subject: [PATCH 0123/3220] UPSTREAM: procfs: fixes pthread cross-thread naming if !PR_DUMPABLE The PR_DUMPABLE flag causes the pid-related paths of the proc file system to be owned by ROOT. The implementation of pthread_set/getname_np, however, needs access to /proc//task//comm. If PR_DUMPABLE is false this implementation is locked out.
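To illustrate the affected path, here is a rough userspace sketch of what pthread_setname_np() for another thread boils down to (simplified; the actual glibc/bionic code differs in details):

	char path[64];
	snprintf(path, sizeof(path), "/proc/self/task/%d/comm", (int)tid);
	int fd = open(path, O_WRONLY);	/* fails with EACCES once the task
					   has been made non-dumpable */
	if (fd >= 0) {
		write(fd, name, strlen(name));
		close(fd);
	}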
This patch installs a special permission function for the file "comm" that grants read and write access to all threads of the same group regardless of the ownership of the inode. For all other threads the function falls back to the generic inode permission check. Signed-off-by: Janis Danisevskis --- fs/proc/base.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b6962c52965..c21bb4beb0d7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3076,6 +3076,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; } +/* + * proc_tid_comm_permission is a special permission function exclusively + * used for the node /proc//task//comm. + * It bypasses generic permission checks in the case where a task of the same + * task group attempts to access the node. + * The rationale behind this is that glibc and bionic access this node for + * cross thread naming (pthread_set/getname_np(!self)). However, if + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, + * which locks out the cross thread naming implementation. + * This function makes sure that the node is always accessible for members of + * the same thread group. + */ +static int proc_tid_comm_permission(struct inode *inode, int mask) +{ + bool is_same_tgroup; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + is_same_tgroup = same_thread_group(current, task); + put_task_struct(task); + + if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { + /* This file (/proc//task//comm) can always be + * read or written by the members of the corresponding + * thread group. + */ + return 0; + } + + return generic_permission(inode, mask); +} + +static const struct inode_operations proc_tid_comm_inode_operations = { + .permission = proc_tid_comm_permission, +}; + /* * Tasks */ @@ -3094,7 +3132,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif - REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), + NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, + &proc_tid_comm_inode_operations, + &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif From 239dd543887daf9337567e68e4de7f42efa53a48 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Wed, 11 May 2016 11:01:02 -0700 Subject: [PATCH 0124/3220] fiq_debugger: Add fiq_debugger.disable option This change allows using the same kernel image with different console options for uart and fiq_debugger. If fiq_debugger.disable is set to 1/y/Y, fiq_debugger will not be initialized.
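For example, with the debugger built in, it can be turned off from the kernel command line:

	fiq_debugger.disable=1

(y and Y work as well; when built as a module, the same knob is available through the 'disable' module parameter added below).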
Change-Id: I71fda54f5f863d13b1437b1f909e52dd375d002d Signed-off-by: Dmitry Shmidt --- drivers/staging/android/fiq_debugger/fiq_debugger.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index 0c558331a3d2..b132cff14f01 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -123,11 +123,13 @@ static bool initial_console_enable; #endif static bool fiq_kgdb_enable; +static bool fiq_debugger_disable; module_param_named(no_sleep, initial_no_sleep, bool, 0644); module_param_named(debug_enable, initial_debug_enable, bool, 0644); module_param_named(console_enable, initial_console_enable, bool, 0644); module_param_named(kgdb_enable, fiq_kgdb_enable, bool, 0644); +module_param_named(disable, fiq_debugger_disable, bool, 0644); #ifdef CONFIG_FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON static inline @@ -1230,6 +1232,10 @@ int fiq_debugger_uart_overlay(void) static int __init fiq_debugger_init(void) { + if (fiq_debugger_disable) { + pr_err("serial_debugger: disabled\n"); + return -ENODEV; + } #if defined(CONFIG_FIQ_DEBUGGER_CONSOLE) fiq_debugger_tty_init(); #endif From cc0063b8eb44f08bb899fb47d562c67d73b7245b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 12 May 2016 11:17:52 -0700 Subject: [PATCH 0125/3220] xt_qtaguid: Fix panic caused by processing non-full socket. In an issue very similar to 4e461c777e3 (xt_qtaguid: Fix panic caused by synack processing), we were seeing panics on occasion in testing. In this case, it was the same issue, but caused by a different call path, as the sk being returned from qtaguid_find_sk() was not a full socket, resulting in the sk->sk_socket dereference failing. This patch adds an extra check to ensure the sk being returned is a full socket, and if not it returns NULL. Reported-by: Milosz Wasilewski Signed-off-by: John Stultz --- net/netfilter/xt_qtaguid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 822dc3c3bce1..e2e7d54f9bb1 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1606,7 +1606,7 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. */ - if (sk->sk_state == TCP_TIME_WAIT) { + if (!sk_fullsock(sk) || sk->sk_state == TCP_TIME_WAIT) { sock_gen_put(sk); sk = NULL; } From d495067ae176cfd7a027d11c956fbacad009a4c7 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 16 May 2016 13:17:28 +0530 Subject: [PATCH 0126/3220] Revert "cpufreq: interactive: build fixes for 4.4" This reverts commit bc68f6c4efbd4ddbb15817203f18b7941d9ffd52. This build fix broke the Interactive Gov at runtime with duplicate sysfs entry warnings at boot time. We no longer need to create/destroy this cpufreq sysfs entry at run time on an as-needed basis thanks to upstream commit 8eec1020f0c0 (cpufreq: create cpu/cpufreq at boot time) which creates it at boot time. Hence drop this build fix.
Signed-off-by: Amit Pundir --- drivers/cpufreq/cpufreq_interactive.c | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index bd83be39f17b..7f846b06e024 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -116,28 +116,6 @@ struct cpufreq_interactive_tunables { bool io_is_busy; }; -/* - * HACK: FIXME: Bring back cpufreq_{get,put}_global_kobject() - * definition removed by upstream commit 8eec1020f0c0 "cpufreq: - * create cpu/cpufreq at boot time" to fix build failures. - */ -static int cpufreq_global_kobject_usage; - -int cpufreq_get_global_kobject(void) -{ - if (!cpufreq_global_kobject_usage++) - return kobject_add(cpufreq_global_kobject, - &cpu_subsys.dev_root->kobj, "%s", "cpufreq"); - - return 0; -} - -void cpufreq_put_global_kobject(void) -{ - if (!--cpufreq_global_kobject_usage) - kobject_del(cpufreq_global_kobject); -} - /* For cases where we have single governor instance for system */ static struct cpufreq_interactive_tunables *common_tunables; From 287d9d0efbd378dc0d7919bc9820737c869f5c0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 16 May 2016 13:25:35 +0530 Subject: [PATCH 0127/3220] cpufreq: interactive: drop cpufreq_{get,put}_global_kobject func calls Upstream commit 8eec1020f0c0 (cpufreq: create cpu/cpufreq at boot time) makes sure that the cpufreq sysfs entry gets created at boot time, and there is no need to create/destroy it on an as-needed basis anymore. So drop the deprecated cpufreq_{get,put}_global_kobject function calls, which otherwise result in the following compilation errors: drivers/cpufreq/cpufreq_interactive.c: In function 'cpufreq_governor_interactive': drivers/cpufreq/cpufreq_interactive.c:1187:4: error: implicit declaration of function 'cpufreq_get_global_kobject' [-Werror=implicit-function-declaration] WARN_ON(cpufreq_get_global_kobject()); ^ drivers/cpufreq/cpufreq_interactive.c:1197:5: error: implicit declaration of function 'cpufreq_put_global_kobject'[-Werror=implicit-function-declaration] cpufreq_put_global_kobject(); ^ Signed-off-by: Amit Pundir --- drivers/cpufreq/cpufreq_interactive.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 7f846b06e024..f2929e628820 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1184,7 +1184,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, policy->governor_data = tunables; if (!have_governor_per_policy()) { common_tunables = tunables; - WARN_ON(cpufreq_get_global_kobject()); } rc = sysfs_create_group(get_governor_parent_kobj(policy), @@ -1194,7 +1193,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, policy->governor_data = NULL; if (!have_governor_per_policy()) { common_tunables = NULL; - cpufreq_put_global_kobject(); } return rc; } @@ -1218,9 +1216,6 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); - if (!have_governor_per_policy()) - cpufreq_put_global_kobject(); - kfree(tunables); common_tunables = NULL; } From 9a326f6c084b089765ed8ee903b76a5e8251b0b3 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 17 May 2016 16:06:17 +0530 Subject: [PATCH 0128/3220] Revert "drivers: power: use 'current' instead of 'get_current()'" This reverts commit e1b5d103894d097fb630aebc3c1fdaf257f7c9bb.
This patch fixed AOSP commit ad86cc8ad632 (drivers: power: Add watchdog timer to catch drivers which lockup during suspend.), which we dropped in Change-Id Ic72a87432e27844155467817600adc6cf0c2209c, so we no longer need this fix. A part of this patch was already reverted in the above-mentioned Change-Id. Signed-off-by: Amit Pundir --- drivers/base/power/main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2f7a6e1b568c..5ea529165362 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -35,8 +35,6 @@ #include #include -#include - #include "../base.h" #include "power.h" From 1f95b0e9be77e4d72a6352b6fc486d0678def27f Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 10 Jan 2016 22:40:55 -0800 Subject: [PATCH 0129/3220] UPSTREAM: tty: Fix unsafe ldisc reference via ioctl(TIOCGETD) (cherry pick from commit 5c17c861a357e9458001f021a7afa7aab9937439) ioctl(TIOCGETD) retrieves the line discipline id directly from the ldisc because the line discipline id (c_line) in termios is untrustworthy; userspace may have set termios via ioctl(TCSETS*) without actually changing the line discipline via ioctl(TIOCSETD). However, directly accessing the current ldisc via tty->ldisc is unsafe; the ldisc ptr dereferenced may be stale if the line discipline is changing via ioctl(TIOCSETD) or hangup. Wait for the line discipline reference (just like read() or write()) to retrieve the "current" line discipline id. Cc: Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Bug: 28409131 Change-Id: I6774bd883a2e48bbe020486c72c42fb410e3f98a --- drivers/tty/tty_io.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index bcc8e1e8bb72..ab15b937fedb 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2652,6 +2652,28 @@ static int tiocsetd(struct tty_struct *tty, int __user *p) return ret; } +/** + * tiocgetd - get line discipline + * @tty: tty device + * @p: pointer to user data + * + * Retrieves the line discipline id directly from the ldisc. + * + * Locking: waits for ldisc reference (in case the line discipline + * is changing or the tty is being hungup) + */ + +static int tiocgetd(struct tty_struct *tty, int __user *p) +{ + struct tty_ldisc *ld; + int ret; + + ld = tty_ldisc_ref_wait(tty); + ret = put_user(ld->ops->num, p); + tty_ldisc_deref(ld); + return ret; +} + /** * send_break - performed time break * @tty: device to break on @@ -2878,7 +2900,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCGSID: return tiocgsid(tty, real_tty, p); case TIOCGETD: - return put_user(tty->ldisc->ops->num, (int __user *)p); + return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: From 8f2aea82ca669025419a4c9ab23b94f97c1ee170 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 4 Apr 2016 14:15:23 -0400 Subject: [PATCH 0130/3220] =?UTF-8?q?UPSTREAM:=20mac80211:=20fix=20"warnin?= =?UTF-8?q?g:=20=E2=80=98target=5Fmetric=E2=80=99=20may=20be=20used=20unin?= =?UTF-8?q?itialized"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (This cherry-picks b4201cc4fc6e1c57d6d306b1f787865043d60129 upstream) This fixes: net/mac80211/mesh_hwmp.c:603:26: warning: ‘target_metric’ may be used uninitialized in this function target_metric is only consumed when reply = true, so no bug exists here, but not all versions of gcc realize it.
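Reduced to a sketch, the false positive has this shape (illustrative only, not the actual mesh_hwmp.c code; the helper names are hypothetical):

	u32 target_metric;		/* no initializer */
	bool reply = false;

	if (have_reply_data()) {	/* hypothetical condition */
		target_metric = compute_metric();
		reply = true;
	}
	/* ... */
	if (reply)
		send_prep(target_metric);	/* only reached when assigned
						   above, but gcc cannot
						   always prove that */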
Initialize to 0 to remove the warning. Change-Id: I13923fda9d314f48196c29e4354133dfe01f5abd Signed-off-by: Jeff Mahoney Signed-off-by: Johannes Berg [jstultz: Cherry-picked to android-4.4] Signed-off-by: John Stultz --- net/mac80211/mesh_hwmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index c6be0b4f4058..b6dc2d7cd650 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -530,7 +530,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, const u8 *target_addr, *orig_addr; const u8 *da; u8 target_flags, ttl, flags; - u32 orig_sn, target_sn, lifetime, target_metric; + u32 orig_sn, target_sn, lifetime, target_metric = 0; bool reply = false; bool forward = true; bool root_is_gate; From cba8d1f729d830a200055f0a4bab51883304ac9c Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Fri, 20 May 2016 11:05:00 +0800 Subject: [PATCH 0131/3220] ANDROID: usb: gadget: f_midi: set fi->f to NULL when free f_midi function fi->f is set in f_midi's alloc_func; it needs to be cleared to NULL in free_func, because otherwise, on a ConfigFS function switch, midi->usb_function itself is freed, fi->f becomes a wild pointer, and we run into the kernel panic below: --------------- [ 58.950628] Unable to handle kernel paging request at virtual address 63697664 [ 58.957869] pgd = c0004000 [ 58.960583] [63697664] *pgd=00000000 [ 58.964185] Internal error: Oops: 80000005 [#1] PREEMPT SMP ARM [ 58.970111] Modules linked in: [ 58.973191] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.1.15-03504-g34c857c-dirty #89 [ 58.981024] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 58.987557] task: c110bd70 ti: c1100000 task.ti: c1100000 [ 58.992962] PC is at 0x63697664 [ 58.996120] LR is at android_setup+0x78/0x138 <..snip..> [ 60.044980] 1fc0: ffffffff ffffffff c1000684 00000000 00000000 c108ecd0 c11f7294 c11039c0 [ 60.053181] 1fe0: c108eccc c110d148 1000406a 412fc09a 00000000 1000807c 00000000 00000000 [ 60.061420] [] (android_setup) from [] (udc_irq+0x758/0x1034) [ 60.068951] [] (udc_irq) from [] (handle_irq_event_percpu+0x50/0x254) [ 60.077165] [] (handle_irq_event_percpu) from [] (handle_irq_event+0x3c/0x5c) [ 60.086072] [] (handle_irq_event) from [] (handle_fasteoi_irq+0xe0/0x198) [ 60.094630] [] (handle_fasteoi_irq) from [] (generic_handle_irq+0x2c/0x3c) [ 60.103271] [] (generic_handle_irq) from [] (__handle_domain_irq+0x7c/0xec) [ 60.112000] [] (__handle_domain_irq) from [] (gic_handle_irq+0x24/0x5c) -------------- Signed-off-by: Winter Wang --- drivers/usb/gadget/function/f_midi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 8a0e7f988d25..847f70363477 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -1140,6 +1140,7 @@ static void f_midi_free(struct usb_function *f) for (i = opts->in_ports - 1; i >= 0; --i) kfree(midi->in_port[i]); kfree(midi); + opts->func_inst.f = NULL; --opts->refcnt; mutex_unlock(&opts->lock); } From c11dd134ea73ff9ee6bdb2807ac064cef2f284bd Mon Sep 17 00:00:00 2001 From: Haojian Zhuang Date: Fri, 22 Apr 2016 17:23:29 +0800 Subject: [PATCH 0132/3220] arm64: add option to build Image-dtb Some bootloaders couldn't decompress Image.gz-dtb.
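With this change the concatenated image can be requested directly, e.g. (assuming a configured arm64 tree):

	make ARCH=arm64 Image-dtb

which simply cats the built Image together with the selected DTBs, mirroring the existing Image.gz-dtb rule.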
Change-Id: I698cd0c4ee6894e8d0655d88f3ecf4826c28a645 Signed-off-by: Haojian Zhuang Signed-off-by: John Stultz Signed-off-by: Dmitry Shmidt --- arch/arm64/Makefile | 2 +- arch/arm64/boot/Makefile | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index a5c84594f6ad..4e77fd165f38 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -106,7 +106,7 @@ dtbs: prepare scripts dtbs_install: $(Q)$(MAKE) $(dtbinst)=$(boot)/dts -Image.gz-dtb: vmlinux scripts dtbs +Image-dtb Image.gz-dtb: vmlinux scripts dtbs $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ PHONY += vdso_install diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile index ddd405f88344..7ab8e74cd83a 100644 --- a/arch/arm64/boot/Makefile +++ b/arch/arm64/boot/Makefile @@ -32,6 +32,9 @@ $(obj)/Image: vmlinux FORCE $(obj)/Image.bz2: $(obj)/Image FORCE $(call if_changed,bzip2) +$(obj)/Image-dtb: $(obj)/Image $(DTB_OBJS) FORCE + $(call if_changed,cat) + $(obj)/Image.gz: $(obj)/Image FORCE $(call if_changed,gzip) From 9f6f7a27247cbfa50c1c9ebdd8417e2006a18100 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 24 May 2016 14:41:57 -0700 Subject: [PATCH 0133/3220] ARM64: Ignore Image-dtb from git point of view Change-Id: I5bbf1db90f28ea956383b4a5d91ad508eea656dc Signed-off-by: Dmitry Shmidt --- arch/arm64/boot/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore index eb3551131b1e..34e35209fc2e 100644 --- a/arch/arm64/boot/.gitignore +++ b/arch/arm64/boot/.gitignore @@ -1,3 +1,4 @@ Image +Image-dtb Image.gz Image.gz-dtb From dbe3b633a70b928e8f7ca92c9eb6751ea4bd71bc Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 26 May 2016 12:57:56 +0530 Subject: [PATCH 0134/3220] Revert "arm: dcc_tty: fix armv6 dcc tty build failure" This reverts commit dfc1d4be88597141f5ad9d39908c13944d209009. Drop AOSP's "armv6 dcc tty driver" in favor of upstream DCC driver for ARMv6/v7 16c63f8ea49c (drivers: char: hvc: add arm JTAG DCC console support) and for ARMv8 4cad4c57e0b3 (ARM64: TTY: hvc_dcc: Add support for ARM64 dcc). Change-Id: I8110a4fd649b8ac1ec9bfac00255c1214135e4b2 Signed-off-by: Amit Pundir --- drivers/char/dcc_tty.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/dcc_tty.c b/drivers/char/dcc_tty.c index 0a62d410286f..a787accdcb14 100644 --- a/drivers/char/dcc_tty.c +++ b/drivers/char/dcc_tty.c @@ -26,7 +26,7 @@ MODULE_DESCRIPTION("DCC TTY Driver"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0"); -DEFINE_SPINLOCK(g_dcc_tty_lock); +static spinlock_t g_dcc_tty_lock = SPIN_LOCK_UNLOCKED; static struct hrtimer g_dcc_timer; static char g_dcc_buffer[16]; static int g_dcc_buffer_head; @@ -80,8 +80,8 @@ static void dcc_poll_locked(void) ); if (rch >= 0) { ch = rch; - tty_insert_flip_string(g_dcc_tty->port, &ch, 1); - tty_flip_buffer_push(g_dcc_tty->port); + tty_insert_flip_string(g_dcc_tty, &ch, 1); + tty_flip_buffer_push(g_dcc_tty); } } From fd2e341b4741a29e04f2010ced14edf5d2aa3ee0 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 26 May 2016 12:58:21 +0530 Subject: [PATCH 0135/3220] Revert "armv6 dcc tty driver" This reverts commit 97312429c2bef1bf8055d01b35cf12028f60ef62. Drop AOSP's "armv6 dcc tty driver" in favor of upstream DCC driver for ARMv6/v7 16c63f8ea49c (drivers: char: hvc: add arm JTAG DCC console support) and for ARMv8 4cad4c57e0b3 (ARM64: TTY: hvc_dcc: Add support for ARM64 dcc). 
Change-Id: I0ca651ef2d854fff03cee070524fe1e3971b6d8f Signed-off-by: Amit Pundir --- drivers/char/Kconfig | 4 - drivers/char/Makefile | 1 - drivers/char/dcc_tty.c | 326 ----------------------------------------- 3 files changed, 331 deletions(-) delete mode 100644 drivers/char/dcc_tty.c diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 29dbe0fbdf8b..a043107da2af 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -588,10 +588,6 @@ config DEVPORT depends on ISA || PCI default y -config DCC_TTY - tristate "DCC tty driver" - depends on ARM - source "drivers/s390/char/Kconfig" config TILE_SROM diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 7671551e3196..d8a7579300d2 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -53,7 +53,6 @@ obj-$(CONFIG_PCMCIA) += pcmcia/ obj-$(CONFIG_HANGCHECK_TIMER) += hangcheck-timer.o obj-$(CONFIG_TCG_TPM) += tpm/ -obj-$(CONFIG_DCC_TTY) += dcc_tty.o obj-$(CONFIG_PS3_FLASH) += ps3flash.o obj-$(CONFIG_JS_RTC) += js-rtc.o diff --git a/drivers/char/dcc_tty.c b/drivers/char/dcc_tty.c deleted file mode 100644 index a787accdcb14..000000000000 --- a/drivers/char/dcc_tty.c +++ /dev/null @@ -1,326 +0,0 @@ -/* drivers/char/dcc_tty.c - * - * Copyright (C) 2007 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_DESCRIPTION("DCC TTY Driver"); -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.0"); - -static spinlock_t g_dcc_tty_lock = SPIN_LOCK_UNLOCKED; -static struct hrtimer g_dcc_timer; -static char g_dcc_buffer[16]; -static int g_dcc_buffer_head; -static int g_dcc_buffer_count; -static unsigned g_dcc_write_delay_usecs = 1; -static struct tty_driver *g_dcc_tty_driver; -static struct tty_struct *g_dcc_tty; -static int g_dcc_tty_open_count; - -static void dcc_poll_locked(void) -{ - char ch; - int rch; - int written; - - while (g_dcc_buffer_count) { - ch = g_dcc_buffer[g_dcc_buffer_head]; - asm( - "mrc 14, 0, r15, c0, c1, 0\n" - "mcrcc 14, 0, %1, c0, c5, 0\n" - "movcc %0, #1\n" - "movcs %0, #0\n" - : "=r" (written) - : "r" (ch) - ); - if (written) { - if (ch == '\n') - g_dcc_buffer[g_dcc_buffer_head] = '\r'; - else { - g_dcc_buffer_head = (g_dcc_buffer_head + 1) % ARRAY_SIZE(g_dcc_buffer); - g_dcc_buffer_count--; - if (g_dcc_tty) - tty_wakeup(g_dcc_tty); - } - g_dcc_write_delay_usecs = 1; - } else { - if (g_dcc_write_delay_usecs > 0x100) - break; - g_dcc_write_delay_usecs <<= 1; - udelay(g_dcc_write_delay_usecs); - } - } - - if (g_dcc_tty && !test_bit(TTY_THROTTLED, &g_dcc_tty->flags)) { - asm( - "mrc 14, 0, %0, c0, c1, 0\n" - "tst %0, #(1 << 30)\n" - "moveq %0, #-1\n" - "mrcne 14, 0, %0, c0, c5, 0\n" - : "=r" (rch) - ); - if (rch >= 0) { - ch = rch; - tty_insert_flip_string(g_dcc_tty, &ch, 1); - tty_flip_buffer_push(g_dcc_tty); - } - } - - - if (g_dcc_buffer_count) - hrtimer_start(&g_dcc_timer, ktime_set(0, g_dcc_write_delay_usecs * NSEC_PER_USEC), HRTIMER_MODE_REL); - else - hrtimer_start(&g_dcc_timer, ktime_set(0, 20 * NSEC_PER_MSEC), HRTIMER_MODE_REL); -} - -static int dcc_tty_open(struct tty_struct * tty, 
struct file * filp) -{ - int ret; - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - if (g_dcc_tty == NULL || g_dcc_tty == tty) { - g_dcc_tty = tty; - g_dcc_tty_open_count++; - ret = 0; - } else - ret = -EBUSY; - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - - printk("dcc_tty_open, tty %p, f_flags %x, returned %d\n", tty, filp->f_flags, ret); - - return ret; -} - -static void dcc_tty_close(struct tty_struct * tty, struct file * filp) -{ - printk("dcc_tty_close, tty %p, f_flags %x\n", tty, filp->f_flags); - if (g_dcc_tty == tty) { - if (--g_dcc_tty_open_count == 0) - g_dcc_tty = NULL; - } -} - -static int dcc_write(const unsigned char *buf_start, int count) -{ - const unsigned char *buf = buf_start; - unsigned long irq_flags; - int copy_len; - int space_left; - int tail; - - if (count < 1) - return 0; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - do { - tail = (g_dcc_buffer_head + g_dcc_buffer_count) % ARRAY_SIZE(g_dcc_buffer); - copy_len = ARRAY_SIZE(g_dcc_buffer) - tail; - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - if (copy_len > space_left) - copy_len = space_left; - if (copy_len > count) - copy_len = count; - memcpy(&g_dcc_buffer[tail], buf, copy_len); - g_dcc_buffer_count += copy_len; - buf += copy_len; - count -= copy_len; - if (copy_len < count && copy_len < space_left) { - space_left -= copy_len; - copy_len = count; - if (copy_len > space_left) { - copy_len = space_left; - } - memcpy(g_dcc_buffer, buf, copy_len); - buf += copy_len; - count -= copy_len; - g_dcc_buffer_count += copy_len; - } - dcc_poll_locked(); - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - } while(count && space_left); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return buf - buf_start; -} - -static int dcc_tty_write(struct tty_struct * tty, const unsigned char *buf, int count) -{ - int ret; - /* printk("dcc_tty_write %p, %d\n", buf, count); */ - ret = dcc_write(buf, count); - if (ret != count) - printk("dcc_tty_write %p, %d, returned %d\n", buf, count, ret); - return ret; -} - -static int dcc_tty_write_room(struct tty_struct *tty) -{ - int space_left; - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - space_left = ARRAY_SIZE(g_dcc_buffer) - g_dcc_buffer_count; - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return space_left; -} - -static int dcc_tty_chars_in_buffer(struct tty_struct *tty) -{ - int ret; - asm( - "mrc 14, 0, %0, c0, c1, 0\n" - "mov %0, %0, LSR #30\n" - "and %0, %0, #1\n" - : "=r" (ret) - ); - return ret; -} - -static void dcc_tty_unthrottle(struct tty_struct * tty) -{ - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - dcc_poll_locked(); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); -} - -static enum hrtimer_restart dcc_tty_timer_func(struct hrtimer *timer) -{ - unsigned long irq_flags; - - spin_lock_irqsave(&g_dcc_tty_lock, irq_flags); - dcc_poll_locked(); - spin_unlock_irqrestore(&g_dcc_tty_lock, irq_flags); - return HRTIMER_NORESTART; -} - -void dcc_console_write(struct console *co, const char *b, unsigned count) -{ -#if 1 - dcc_write(b, count); -#else - /* blocking printk */ - while (count > 0) { - int written; - written = dcc_write(b, count); - if (written) { - b += written; - count -= written; - } - } -#endif -} - -static struct tty_driver *dcc_console_device(struct console *c, int *index) -{ - *index = 0; - return g_dcc_tty_driver; -} - -static int __init dcc_console_setup(struct console *co, char *options) -{ - if (co->index != 0) 
- return -ENODEV; - return 0; -} - - -static struct console dcc_console = -{ - .name = "ttyDCC", - .write = dcc_console_write, - .device = dcc_console_device, - .setup = dcc_console_setup, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -static struct tty_operations dcc_tty_ops = { - .open = dcc_tty_open, - .close = dcc_tty_close, - .write = dcc_tty_write, - .write_room = dcc_tty_write_room, - .chars_in_buffer = dcc_tty_chars_in_buffer, - .unthrottle = dcc_tty_unthrottle, -}; - -static int __init dcc_tty_init(void) -{ - int ret; - - hrtimer_init(&g_dcc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - g_dcc_timer.function = dcc_tty_timer_func; - - g_dcc_tty_driver = alloc_tty_driver(1); - if (!g_dcc_tty_driver) { - printk(KERN_ERR "dcc_tty_probe: alloc_tty_driver failed\n"); - ret = -ENOMEM; - goto err_alloc_tty_driver_failed; - } - g_dcc_tty_driver->owner = THIS_MODULE; - g_dcc_tty_driver->driver_name = "dcc"; - g_dcc_tty_driver->name = "ttyDCC"; - g_dcc_tty_driver->major = 0; // auto assign - g_dcc_tty_driver->minor_start = 0; - g_dcc_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; - g_dcc_tty_driver->subtype = SERIAL_TYPE_NORMAL; - g_dcc_tty_driver->init_termios = tty_std_termios; - g_dcc_tty_driver->flags = TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; - tty_set_operations(g_dcc_tty_driver, &dcc_tty_ops); - ret = tty_register_driver(g_dcc_tty_driver); - if (ret) { - printk(KERN_ERR "dcc_tty_probe: tty_register_driver failed, %d\n", ret); - goto err_tty_register_driver_failed; - } - tty_register_device(g_dcc_tty_driver, 0, NULL); - - register_console(&dcc_console); - hrtimer_start(&g_dcc_timer, ktime_set(0, 0), HRTIMER_MODE_REL); - - return 0; - -err_tty_register_driver_failed: - put_tty_driver(g_dcc_tty_driver); - g_dcc_tty_driver = NULL; -err_alloc_tty_driver_failed: - return ret; -} - -static void __exit dcc_tty_exit(void) -{ - int ret; - - tty_unregister_device(g_dcc_tty_driver, 0); - ret = tty_unregister_driver(g_dcc_tty_driver); - if (ret < 0) { - printk(KERN_ERR "dcc_tty_remove: tty_unregister_driver failed, %d\n", ret); - } else { - put_tty_driver(g_dcc_tty_driver); - } - g_dcc_tty_driver = NULL; -} - -module_init(dcc_tty_init); -module_exit(dcc_tty_exit); - - From ef55b4532be92094b0a1f03f2e1a50d4b49a30ae Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2016 10:18:51 +0100 Subject: [PATCH 0136/3220] UPSTREAM: arm64: module: fix relocation of movz instruction with negative immediate The test whether a movz instruction with a signed immediate should be turned into a movn instruction (i.e., when the immediate is negative) is flawed, since the value of imm is always positive. Also, the subsequent bounds check is incorrect since the limit update never executes, due to the fact that the imm_type comparison will always be false for negative signed immediates. Let's fix this by performing the sign test on sval directly, and replacing the bounds check with a simple comparison against U16_MAX. 
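As a concrete illustration of the flaw, take a signed G0 relocation of the value -1: sval = 0xffffffffffffffff, and the old code computed imm = sval & 0xffff = 0xffff. Since (s64)imm is 65535 and never negative, the MOVN branch could never be taken and the instruction stayed MOVZ #0xffff, materializing 0x000000000000ffff instead of -1. Testing sval itself selects MOVN and inverts the immediate (~sval = 0), and MOVN #0 correctly yields all ones.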
Change-Id: I9ad3d8bfd91e5fdc6434b1be6c3062dfec193176 Signed-off-by: Ard Biesheuvel [will: tidied up use of sval, renamed MOVK enum value to MOVKZ] Signed-off-by: Will Deacon --- arch/arm64/kernel/module.c | 51 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index f4bc779e62e8..03464ab0fff2 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -30,9 +30,6 @@ #include #include -#define AARCH64_INSN_IMM_MOVNZ AARCH64_INSN_IMM_MAX -#define AARCH64_INSN_IMM_MOVK AARCH64_INSN_IMM_16 - void *module_alloc(unsigned long size) { void *p; @@ -110,16 +107,20 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) return 0; } +enum aarch64_insn_movw_imm_type { + AARCH64_INSN_IMM_MOVNZ, + AARCH64_INSN_IMM_MOVKZ, +}; + static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, - int lsb, enum aarch64_insn_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type) { - u64 imm, limit = 0; + u64 imm; s64 sval; u32 insn = le32_to_cpu(*(u32 *)place); sval = do_reloc(op, place, val); - sval >>= lsb; - imm = sval & 0xffff; + imm = sval >> lsb; if (imm_type == AARCH64_INSN_IMM_MOVNZ) { /* @@ -128,7 +129,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, * immediate is less than zero. */ insn &= ~(3 << 29); - if ((s64)imm >= 0) { + if (sval >= 0) { /* >=0: Set the instruction to MOVZ (opcode 10b). */ insn |= 2 << 29; } else { @@ -140,29 +141,13 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, */ imm = ~imm; } - imm_type = AARCH64_INSN_IMM_MOVK; } /* Update the instruction with the new encoding. */ - insn = aarch64_insn_encode_immediate(imm_type, insn, imm); + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); *(u32 *)place = cpu_to_le32(insn); - /* Shift out the immediate field. */ - sval >>= 16; - - /* - * For unsigned immediates, the overflow check is straightforward. - * For signed immediates, the sign bit is actually the bit past the - * most significant bit of the field. - * The AARCH64_INSN_IMM_16 immediate type is unsigned. - */ - if (imm_type != AARCH64_INSN_IMM_16) { - sval++; - limit++; - } - - /* Check the upper bits depending on the sign of the immediate. */ - if ((u64)sval > limit) + if (imm > U16_MAX) return -ERANGE; return 0; @@ -267,25 +252,25 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, overflow_check = false; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. 
*/ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, @@ -302,7 +287,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, @@ -311,7 +296,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, @@ -320,7 +305,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, From d8853fd2e900115923bb36a9137d6b3f1095ee70 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2016 10:18:52 +0100 Subject: [PATCH 0137/3220] UPSTREAM: arm64: module: avoid undefined shift behavior in reloc_data() Compilers may engage the improbability drive when encountering shifts by a distance that is a multiple of the size of the operand type. Since the required bounds check is very simple here, we can get rid of all the fuzzy masking, shifting and comparing, and use the documented bounds directly. Change-Id: Ibc1b73f4a630bc182deb6edfa7458b5e29ba9577 Reported-by: David Binderman Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon --- arch/arm64/kernel/module.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 03464ab0fff2..93e970231ca9 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -72,15 +72,18 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, void *place, u64 val) static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) { - u64 imm_mask = (1 << len) - 1; s64 sval = do_reloc(op, place, val); switch (len) { case 16: *(s16 *)place = sval; + if (sval < S16_MIN || sval > U16_MAX) + return -ERANGE; break; case 32: *(s32 *)place = sval; + if (sval < S32_MIN || sval > U32_MAX) + return -ERANGE; break; case 64: *(s64 *)place = sval; @@ -89,21 +92,6 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) pr_err("Invalid length (%d) for data relocation\n", len); return 0; } - - /* - * Extract the upper value bits (including the sign bit) and - * shift them to bit 0. - */ - sval = (s64)(sval & ~(imm_mask >> 1)) >> (len - 1); - - /* - * Overflow has occurred if the value is not representable in - * len bits (i.e the bottom len bits are not sign-extended and - * the top bits are not all zero). - */ - if ((u64)(sval + 1) > 2) - return -ERANGE; - return 0; } From ebac2a3dcd458126b8c918b3315b1367b94f274d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 19 Jan 2016 21:35:15 +0000 Subject: [PATCH 0138/3220] BACKPORT: perf tools: Document the perf sysctls perf_event_paranoid was only documented in source code and a perf error message. Copy the documentation from the error message to Documentation/sysctl/kernel.txt. 
perf_cpu_time_max_percent was already documented but missing from the list at the top, so add it there. Signed-off-by: Ben Hutchings Cc: Peter Zijlstra Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20160119213515.GG2637@decadent.org.uk [ Remove reference to external Documentation file, provide info inline, as before ] Signed-off-by: Arnaldo Carvalho de Melo Bug: 29054680 Change-Id: I13e73cfb2ad761c94762d0c8196df7725abdf5c5 --- Documentation/sysctl/kernel.txt | 13 +++++++++++++ tools/perf/util/evsel.c | 15 +++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index af70d1541d3a..88a2c8eb4502 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -58,6 +58,8 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- perf_cpu_time_max_percent +- perf_event_paranoid - pid_max - powersave-nap [ PPC only ] - printk @@ -624,6 +626,17 @@ allowed to execute. ============================================================== +perf_event_paranoid: + +Controls use of the performance events system by unprivileged +users (without CAP_SYS_ADMIN). The default value is 1. + + -1: Allow use of (almost) all events by all users +>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK +>=1: Disallow CPU event access by users without CAP_SYS_ADMIN +>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN + +============================================================== pid_max: diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 397fb4ed3c97..d4913a46ee1c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2313,12 +2313,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, case EPERM: case EACCES: return scnprintf(msg, size, - "You may not have permission to collect %sstats.\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" - " -1 - Not paranoid at all\n" - " 0 - Disallow raw tracepoint access for unpriv\n" - " 1 - Disallow cpu events for unpriv\n" - " 2 - Disallow kernel profiling for unpriv", + "You may not have permission to collect %sstats.\n\n" + "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" + "which controls use of the performance events system by\n" + "unprivileged users (without CAP_SYS_ADMIN).\n\n" + "The default value is 1:\n\n" + " -1: Allow use of (almost) all events by all users\n" + ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n" + ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" + ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN", target->system_wide ? "system-wide " : ""); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", From 9d5f5d93461e2b6f697054c06ed26bc5fa4d629e Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 29 May 2016 14:22:32 -0700 Subject: [PATCH 0139/3220] FROMLIST: security,perf: Allow further restriction of perf_event_open When kernel.perf_event_paranoid is set to 3 (or greater), disallow all access to performance events by users without CAP_SYS_ADMIN. Add a Kconfig symbol CONFIG_SECURITY_PERF_EVENTS_RESTRICT that makes this value the default. This is based on a similar feature in grsecurity (CONFIG_GRKERNSEC_PERF_HARDEN). This version doesn't include making the variable read-only. It also allows enabling further restriction at run-time regardless of whether the default is changed.
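As a usage sketch, the restriction can also be enabled at run time on a kernel built without the new Kconfig default:

	# echo 3 > /proc/sys/kernel/perf_event_paranoid

after which perf_event_open() fails with EACCES for any caller lacking CAP_SYS_ADMIN, per the check added to the syscall below.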
https://lkml.org/lkml/2016/1/11/587 Signed-off-by: Ben Hutchings Bug: 29054680 Change-Id: Iff5bff4fc1042e85866df9faa01bce8d04335ab8 --- Documentation/sysctl/kernel.txt | 4 +++- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 ++++++++ security/Kconfig | 9 +++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 88a2c8eb4502..5728779df1ab 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -629,12 +629,14 @@ allowed to execute. perf_event_paranoid: Controls use of the performance events system by unprivileged -users (without CAP_SYS_ADMIN). The default value is 1. +users (without CAP_SYS_ADMIN). The default value is 3 if +CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set, or 1 otherwise. -1: Allow use of (almost) all events by all users >=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK >=1: Disallow CPU event access by users without CAP_SYS_ADMIN >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN +>=3: Disallow all event access by users without CAP_SYS_ADMIN ============================================================== diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..aa7294092e51 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -989,6 +989,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, loff_t *ppos); +static inline bool perf_paranoid_any(void) +{ + return sysctl_perf_event_paranoid > 2; +} + static inline bool perf_paranoid_tracepoint_raw(void) { return sysctl_perf_event_paranoid > -1; diff --git a/kernel/events/core.c b/kernel/events/core.c index cfc227ccfceb..85bc810bece6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -175,8 +175,13 @@ static struct srcu_struct pmus_srcu; * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv + * 3 - disallow all unpriv perf event use */ +#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT +int sysctl_perf_event_paranoid __read_mostly = 3; +#else int sysctl_perf_event_paranoid __read_mostly = 1; +#endif /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ @@ -8265,6 +8270,9 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; + if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; diff --git a/security/Kconfig b/security/Kconfig index e45237897b43..30a2603e8c85 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -18,6 +18,15 @@ config SECURITY_DMESG_RESTRICT If you are unsure how to answer this question, answer N. +config SECURITY_PERF_EVENTS_RESTRICT + bool "Restrict unprivileged use of performance events" + depends on PERF_EVENTS + help + If you say Y here, the kernel.perf_event_paranoid sysctl + will be set to 3 by default, and no unprivileged use of the + perf_event_open syscall will be permitted unless it is + changed. + config SECURITY bool "Enable different security models" depends on SYSFS From 0ac94b48c0c947accdacf310c068f6bad4796d02 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Wed, 1 Jun 2016 13:44:47 -0700 Subject: [PATCH 0140/3220] ANDROID: restrict access to perf events Add: CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y to android-base.cfg The kernel.perf_event_paranoid sysctl is set to 3 by default. 
No unprivileged use of the perf_event_open syscall will be permitted unless it is changed. Bug: 29054680 Change-Id: Ie7512259150e146d8e382dc64d40e8faaa438917 --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 304f1d4fd7c4..6db5542a51f4 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -145,6 +145,7 @@ CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y CONFIG_SECURITY_SELINUX=y CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y From 249d2baf9b494381f2ff4f21feeaaa47eaa78e7c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jun 2016 14:06:14 -0700 Subject: [PATCH 0141/3220] ANDROID: dm verity fec: limit error correction recursion If verity tree itself is sufficiently corrupted in addition to data blocks, it's possible for error correction to end up in a deep recursive error correction loop that eventually causes a kernel panic as follows: [ 14.728962] [] verity_fec_decode+0xa8/0x138 [ 14.734691] [] verity_verify_level+0x11c/0x180 [ 14.740681] [] verity_hash_for_block+0x88/0xe0 [ 14.746671] [] fec_decode_rsb+0x318/0x75c [ 14.752226] [] verity_fec_decode+0xa8/0x138 [ 14.757956] [] verity_verify_level+0x11c/0x180 [ 14.763944] [] verity_hash_for_block+0x88/0xe0 This change limits the recursion to a reasonable level during a single I/O operation. Bug: 28943429 Signed-off-by: Sami Tolvanen Change-Id: I0a7ebff331d259c59a5e03c81918cc1613c3a766 (cherry picked from commit f4b9e40597e73942d2286a73463c55f26f61bfa7) --- drivers/md/dm-verity-fec.c | 11 ++++++++++- drivers/md/dm-verity-fec.h | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index ad10d6d8ed28..b26809a47ca3 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -442,6 +442,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; + if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { + DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + return -EIO; + } + + fio->level++; + if (type == DM_VERITY_BLOCK_TYPE_METADATA) block += v->data_blocks; @@ -475,7 +482,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (r < 0) { r = fec_decode_rsb(v, io, fio, rsb, offset, true); if (r < 0) - return r; + goto done; } if (dest) @@ -485,6 +492,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, r = verity_for_bv_block(v, io, iter, fec_bv_copy); } +done: + fio->level--; return r; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 8c4bee052a73..b8e21cef3ad1 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -28,6 +28,9 @@ #define DM_VERITY_FEC_BUF_MAX \ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) +/* maximum recursion level for verity_fec_decode */ +#define DM_VERITY_FEC_MAX_RECURSION 4 + #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" #define DM_VERITY_OPT_FEC_START "fec_start" @@ -61,6 +64,7 @@ struct dm_verity_fec_io { unsigned nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ size_t output_pos; + unsigned level; /* recursion level */ }; #ifdef CONFIG_DM_VERITY_FEC From c4d8e3e8d2aa09f5826f517339baa7632f7873ea Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jun 2016 
14:22:46 -0700 Subject: [PATCH 0142/3220] ANDROID: dm verity fec: add missing release from fec_ktype Add a release function to allow destroying the dm-verity device. Bug: 27928374 Signed-off-by: Sami Tolvanen Change-Id: Ic0f7c17e4889c5580d70b52d9a709a37165a5747 (cherry picked from commit 0039ccf47c8f99888f7b71b2a36a68a027fbe357) --- drivers/md/dm-verity-fec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index b26809a47ca3..454535d23a7f 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -689,7 +689,8 @@ static struct attribute *fec_attrs[] = { static struct kobj_type fec_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = fec_attrs + .default_attrs = fec_attrs, + .release = dm_kobject_release }; /* From 8f9576b38193b8eb32d94d17f02baab436c3580a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jun 2016 11:22:03 -0700 Subject: [PATCH 0143/3220] ANDROID: dm verity fec: fix RS block calculation A call to do_div was changed in Linux 4.5 to div64_u64 in verity_fec_decode, which broke RS block calculation due to incompatible semantics. This change fixes the computation. Bug: 21893453 Change-Id: Idb88b901e0209c2cccc9c0796689f780592d58f9 Signed-off-by: Sami Tolvanen (cherry picked from commit 879aac93eebcc2862d71afa9eca3a0c0f51b3b01) --- drivers/md/dm-verity-fec.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 454535d23a7f..a1e8571ce314 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -463,9 +463,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, */ offset = block << v->data_dev_block_bits; - - res = offset; - div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits); /* * The base RS block we can feed to the interleaver to find out all From d053106b93363b77ef8ca60352069cbdc8fa0f87 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 17 Jun 2016 11:31:17 -0700 Subject: [PATCH 0144/3220] ANDROID: dm verity fec: initialize recursion level Explicitly initialize recursion level to zero at the beginning of each I/O operation. Bug: 28943429 Change-Id: I00c612be2b8c22dd5afb65a739551df91cb324fc Signed-off-by: Sami Tolvanen (cherry picked from commit 32ffb3a22d7fd269b2961323478ece92c06a8334) --- drivers/md/dm-verity-fec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index a1e8571ce314..1dd667b97530 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -532,6 +532,7 @@ void verity_fec_init_io(struct dm_verity_io *io) memset(fio->bufs, 0, sizeof(fio->bufs)); fio->nbufs = 0; fio->output = NULL; + fio->level = 0; } /* From ab4f14dd052ba453558a8913d8d5547abbfc7b88 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 7 Mar 2016 11:31:10 +0100 Subject: [PATCH 0145/3220] UPSTREAM: usbnet: cleanup after bind() in probe() (cherry pick from commit 1666984c8625b3db19a9abc298931d35ab7bc64b) In case bind() works, but a later error forces bailing out of probe(), a deferred kevent may still be pending and a timer may be scheduled. They must be killed. This fixes an error case related to the double free reported in http://www.spinics.net/lists/netdev/msg367669.html and needs to go on top of Linus' fix to cdc-ncm. Signed-off-by: Oliver Neukum Signed-off-by: David S.
Miller Bug: 28744625 --- drivers/net/usb/usbnet.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 0744bf2ef2d6..c2ea4e5666fb 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1766,6 +1766,13 @@ out3: if (info->unbind) info->unbind (dev, udev); out1: + /* subdrivers must undo all they did in bind() if they + * fail it, but we may fail later and a deferred kevent + * may trigger an error resubmitting itself and, worse, + * schedule a timer. So we kill it all just in case. + */ + cancel_work_sync(&dev->kevent); + del_timer_sync(&dev->delay); free_netdev(net); out: return status; From 872e24621a63b75b3b644f6369b04bfb092264b1 Mon Sep 17 00:00:00 2001 From: Thierry Strudel Date: Tue, 14 Jun 2016 17:46:44 -0700 Subject: [PATCH 0146/3220] ANDROID: cpu: send KOBJ_ONLINE event when enabling cpus In case some sysfs nodes need to be labeled with a different label than sysfs, the user needs to be notified when a core is brought back online. Signed-off-by: Thierry Strudel Bug: 29359497 Change-Id: I0395c86e01cd49c348fda8f93087d26f88557c91 --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 37731292f8a1..9ced7c751648 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -627,6 +627,7 @@ void __weak arch_enable_nonboot_cpus_end(void) void enable_nonboot_cpus(void) { int cpu, error; + struct device *cpu_device; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); @@ -644,6 +645,12 @@ void enable_nonboot_cpus(void) trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); + cpu_device = get_cpu_device(cpu); + if (!cpu_device) + pr_err("%s: failed to get cpu%d device\n", + __func__, cpu); + else + kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE); continue; } pr_warn("Error taking CPU%d up: %d\n", cpu, error); From 6da57e2f0beee58a6570719a865f4e1b1f7822c2 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 23 Jun 2016 11:51:39 +0530 Subject: [PATCH 0147/3220] ANDROID: configs: remove unused configs Remove the following configs, which no longer exist: CONFIG_IP6_NF_TARGET_REJECT_SKERR CONFIG_IP_NF_TARGET_REJECT_SKERR CONFIG_RESOURCE_COUNTERS CONFIG_TABLET_USB_WACOM Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 3 --- android/configs/android-recommended.cfg | 1 - 2 files changed, 4 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 6db5542a51f4..bdd99f2becfc 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -37,7 +37,6 @@ CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MANGLE=y CONFIG_IP6_NF_RAW=y CONFIG_IP6_NF_TARGET_REJECT=y -CONFIG_IP6_NF_TARGET_REJECT_SKERR=y CONFIG_IPV6=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_MULTIPLE_TABLES=y @@ -64,7 +63,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_TARGET_NETMAP=y CONFIG_IP_NF_TARGET_REDIRECT=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_REJECT_SKERR=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y @@ -140,7 +138,6 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_QUOTA=y -CONFIG_RESOURCE_COUNTERS=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECURITY=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 35936afdcae4..c3222a77ba24 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -110,7 +110,6 @@ CONFIG_TABLET_USB_AIPTEK=y CONFIG_TABLET_USB_GTCO=y
CONFIG_TABLET_USB_HANWANG=y CONFIG_TABLET_USB_KBTAB=y -CONFIG_TABLET_USB_WACOM=y CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_IO_ACCOUNTING=y From b40d9bcad6420669ccf41f69278506125fa68428 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:46:24 -0400 Subject: [PATCH 0148/3220] UPSTREAM: net: fix infoleak in rtnetlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 5f8e44741f9f216e33736ea4ec65ca9ac03036e6) The stack object “map” has a total size of 32 bytes. Its last 4 bytes are padding generated by compiler. These padding bytes are not initialized and sent out via “nla_put”. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller Bug: 28620102 Change-Id: Ica015c6a90d47e9188b1cd87a280ac6819dd9d09 --- net/core/rtnetlink.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 34ba7a08876d..9c6d15756e7a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1174,14 +1174,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = dev->mem_start; + map.mem_end = dev->mem_end; + map.base_addr = dev->base_addr; + map.irq = dev->irq; + map.dma = dev->dma; + map.port = dev->if_port; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) return -EMSGSIZE; From 14dfd8269605b9dc3dcdd10945398d3822f07953 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:07 -0400 Subject: [PATCH 0149/3220] UPSTREAM: ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit cec8f96e49d9be372fdb0c3836dcf31ec71e457e) The stack object “tread” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980557 Change-Id: Ibda2d126f6d72fedf797a98796c3cde7bb03db76 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 8b06413e967e..8b6c3605d2e9 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1680,6 +1680,7 @@ static int snd_timer_user_params(struct file *file, if (tu->timeri->flags & SNDRV_TIMER_IFLG_EARLY_EVENT) { if (tu->tread) { struct snd_timer_tread tread; + memset(&tread, 0, sizeof(tread)); tread.event = SNDRV_TIMER_EVENT_EARLY; tread.tstamp.tv_sec = 0; tread.tstamp.tv_nsec = 0; From 29d02a76f5ff4af4eeb5f1ef3b09cd5b48fc2662 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:32 -0400 Subject: [PATCH 0150/3220] UPSTREAM: ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit e4ec8cc8039a7063e24204299b462bd1383184a5) The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. 
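All of the leaks above share one pattern: the named fields of an on-stack struct are initialized, but compiler-inserted padding between or after them still holds stale stack bytes, so copying sizeof(struct) bytes to userspace ships those bytes out. A minimal illustrative sketch of the bug and of the memset() idiom the fixes apply (the struct is made up, not taken from these drivers):

#include <string.h>

/* the compiler may insert 4 bytes of padding after 'flag' so that
 * 'value' is 8-byte aligned; an initializer for the named fields
 * leaves that padding holding whatever was on the stack before
 */
struct example_info {
    int flag;        /* 4 bytes */
    long long value; /* alignment can force padding before this */
};

void fill_leaky(struct example_info *out)
{
    struct example_info info = { .flag = 1, .value = 42 }; /* padding uninitialized */
    memcpy(out, &info, sizeof(info)); /* copies the stale padding bytes too */
}

void fill_safe(struct example_info *out)
{
    struct example_info info;

    memset(&info, 0, sizeof(info)); /* zero the padding explicitly, as the patches do */
    info.flag = 1;
    info.value = 42;
    memcpy(out, &info, sizeof(info));
}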
Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980217 Change-Id: If2bba3c9ffb4e57190583b0bb2524d3b2514b2a3 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index 8b6c3605d2e9..ded95e6f201f 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1215,6 +1215,7 @@ static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri, } if ((tu->filter & (1 << SNDRV_TIMER_EVENT_RESOLUTION)) && tu->last_resolution != resolution) { + memset(&r1, 0, sizeof(r1)); r1.event = SNDRV_TIMER_EVENT_RESOLUTION; r1.tstamp = tstamp; r1.val = resolution; From 3d1e8cf7ef81f13d4da5218174d12a40e8bf1116 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:44:20 -0400 Subject: [PATCH 0151/3220] UPSTREAM: ALSA: timer: Fix leak in events via snd_timer_user_ccallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 9a47e9cff994f37f7f0dbd9ae23740d0f64f9fe6) The stack object “r1” has a total size of 32 bytes. Its field “event” and “val” both contain 4 bytes padding. These 8 bytes padding bytes are sent to user without being initialized. Signed-off-by: Kangjie Lu Signed-off-by: Takashi Iwai Bug: 28980217 Change-Id: I2e4c27352894b9f1f4c808b8db3ae5f9284faec1 --- sound/core/timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/timer.c b/sound/core/timer.c index ded95e6f201f..1701703b8d93 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1181,6 +1181,7 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri, tu->tstamp = *tstamp; if ((tu->filter & (1 << event)) == 0 || !tu->tread) return; + memset(&r1, 0, sizeof(r1)); r1.event = event; r1.tstamp = *tstamp; r1.val = resolution; From 659740adf180779d0177a086656880bc7a43cfd1 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 3 May 2016 16:32:16 -0400 Subject: [PATCH 0152/3220] UPSTREAM: USB: usbfs: fix potential infoleak in devio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 681fef8380eb818c0b845fca5d2ab1dcbab114ee) The stack object “ci” has a total size of 8 bytes. Its last 3 bytes are padding bytes which are not initialized and leaked to userland via “copy_to_user”. Signed-off-by: Kangjie Lu Signed-off-by: Greg Kroah-Hartman Bug: 28619695 Change-Id: I170754d659d0891c075f85211b5e3970b114f097 --- drivers/usb/core/devio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 38ae877c46e3..3ffb01ff6549 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1203,10 +1203,11 @@ static int proc_getdriver(struct usb_dev_state *ps, void __user *arg) static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg) { - struct usbdevfs_connectinfo ci = { - .devnum = ps->dev->devnum, - .slow = ps->dev->speed == USB_SPEED_LOW - }; + struct usbdevfs_connectinfo ci; + + memset(&ci, 0, sizeof(ci)); + ci.devnum = ps->dev->devnum; + ci.slow = ps->dev->speed == USB_SPEED_LOW; if (copy_to_user(arg, &ci, sizeof(ci))) return -EFAULT; From 7dbe59704ab0dd0a07ae2c2b7b590bb2b8ccd6d9 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 23 Jun 2016 15:35:07 +0530 Subject: [PATCH 0153/3220] ANDROID: base-cfg: enable UID_CPUTIME Enabled UID_CPUTIME and dependent PROFILING config option. UID_CPUTIME (/proc/uid_cputime) interfaces provide amount of time a UID's processes spent executing in user-space and kernel-space. 
It is used by batterystats service. Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index bdd99f2becfc..6496bb3961a2 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -137,6 +137,7 @@ CONFIG_PPP_BSDCOMP=y CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y +CONFIG_PROFILING=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y @@ -149,6 +150,7 @@ CONFIG_STAGING=y CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y +CONFIG_UID_CPUTIME=y CONFIG_UNIX=y CONFIG_USB_GADGET=y CONFIG_USB_CONFIGFS=y From ef51a2254d6579560bf7ff8ebbeb9b3848fe10bf Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Thu, 7 Jan 2016 16:46:14 +0100 Subject: [PATCH 0154/3220] BACKPORT: PM / sleep: Go direct_complete if driver has no callbacks Backport notes: This resolves clk warnings in the designware i2c driver on HiKey seen during suspend/resume. Cherrypicked from: aa8e54b559479d0cb7eb632ba443b8cacd20cd4b If a suitable prepare callback cannot be found for a given device and its driver has no PM callbacks at all, assume that it can go direct to complete when the system goes to sleep. The reason for this is that there's lots of devices in a system that do no PM at all and there's no reason for them to prevent their ancestors to do direct_complete if they can support it. Change-Id: Ia773afb4b266f012336b99fc8cf87453839e078b Signed-off-by: Tomeu Vizoso Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki [jstultz: Backported to 4.4] Signed-off-by: John Stultz --- drivers/base/dd.c | 3 +++ drivers/base/power/main.c | 35 +++++++++++++++++++++++++++++++++++ drivers/base/power/power.h | 3 +++ include/linux/pm.h | 1 + 4 files changed, 42 insertions(+) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index a641cf3ccad6..9e425fbf83cb 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -205,6 +205,8 @@ static void driver_bound(struct device *dev) klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); + device_pm_check_callbacks(dev); + /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices @@ -697,6 +699,7 @@ static void __device_release_driver(struct device *dev) dev->pm_domain->dismiss(dev); klist_remove(&dev->p->knode_driver); + device_pm_check_callbacks(dev); if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBOUND_DRIVER, diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5ea529165362..77379f3d136a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -126,6 +126,7 @@ void device_pm_add(struct device *dev) { pr_debug("PM: Adding info for %s:%s\n", dev->bus ? 
dev->bus->name : "No Bus", dev_name(dev)); + device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", @@ -148,6 +149,7 @@ void device_pm_remove(struct device *dev) mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); + device_pm_check_callbacks(dev); } /** @@ -1574,6 +1576,11 @@ static int device_prepare(struct device *dev, pm_message_t state) dev->power.wakeup_path = device_may_wakeup(dev); + if (dev->power.no_pm_callbacks) { + ret = 1; /* Let device go direct_complete */ + goto unlock; + } + if (dev->pm_domain) { info = "preparing power domain "; callback = dev->pm_domain->ops.prepare; @@ -1596,6 +1603,7 @@ static int device_prepare(struct device *dev, pm_message_t state) if (callback) ret = callback(dev); +unlock: device_unlock(dev); if (ret < 0) { @@ -1724,3 +1732,30 @@ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); + +static bool pm_ops_is_empty(const struct dev_pm_ops *ops) +{ + if (!ops) + return true; + + return !ops->prepare && + !ops->suspend && + !ops->suspend_late && + !ops->suspend_noirq && + !ops->resume_noirq && + !ops->resume_early && + !ops->resume && + !ops->complete; +} + +void device_pm_check_callbacks(struct device *dev) +{ + spin_lock_irq(&dev->power.lock); + dev->power.no_pm_callbacks = + (!dev->bus || pm_ops_is_empty(dev->bus->pm)) && + (!dev->class || pm_ops_is_empty(dev->class->pm)) && + (!dev->type || pm_ops_is_empty(dev->type->pm)) && + (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && + (!dev->driver || pm_ops_is_empty(dev->driver->pm)); + spin_unlock_irq(&dev->power.lock); +} diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 998fa6b23084..297beae64314 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -123,6 +123,7 @@ extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); +extern void device_pm_check_callbacks(struct device *dev); #else /* !CONFIG_PM_SLEEP */ @@ -141,6 +142,8 @@ static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} +static inline void device_pm_check_callbacks(struct device *dev) {} + #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) diff --git a/include/linux/pm.h b/include/linux/pm.h index 528be6787796..6a5d654f4447 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -573,6 +573,7 @@ struct dev_pm_info { struct wakeup_source *wakeup; bool wakeup_path:1; bool syscore:1; + bool no_pm_callbacks:1; /* Owned by the PM core */ #else unsigned int should_wakeup:1; #endif From e639a4e74d6cd2c01d4f2afbbe0b6e6e233b42d2 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 27 Jun 2016 13:33:59 -0700 Subject: [PATCH 0155/3220] Revert "usb: gadget: prevent change of Host MAC address of 'usb0' interface" This reverts commit 265801537d110eb68d44a2f66015479908f635c0. 
Signed-off-by: Badhri Jagan Sridharan --- drivers/usb/gadget/function/u_ether.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index dd73dfe5dcab..74e9f5b5a45d 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -863,8 +863,6 @@ static int eth_stop(struct net_device *net) /*-------------------------------------------------------------------------*/ -static u8 host_ethaddr[ETH_ALEN]; - static int get_ether_addr(const char *str, u8 *dev_addr) { if (str) { @@ -895,17 +893,6 @@ static int get_ether_addr_str(u8 dev_addr[ETH_ALEN], char *str, int len) return 18; } -static int get_host_ether_addr(u8 *str, u8 *dev_addr) -{ - memcpy(dev_addr, str, ETH_ALEN); - if (is_valid_ether_addr(dev_addr)) - return 0; - - random_ether_addr(dev_addr); - memcpy(str, dev_addr, ETH_ALEN); - return 1; -} - static const struct net_device_ops eth_netdev_ops = { .ndo_open = eth_open, .ndo_stop = eth_stop, @@ -963,11 +950,9 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g, if (get_ether_addr(dev_addr, net->dev_addr)) dev_warn(&g->dev, "using random %s ethernet address\n", "self"); - - if (get_host_ether_addr(host_ethaddr, dev->host_mac)) - dev_warn(&g->dev, "using random %s ethernet address\n", "host"); - else - dev_warn(&g->dev, "using previous %s ethernet address\n", "host"); + if (get_ether_addr(host_addr, dev->host_mac)) + dev_warn(&g->dev, + "using random %s ethernet address\n", "host"); if (ethaddr) memcpy(ethaddr, dev->host_mac, ETH_ALEN); From 074c908972a215f63bb5409f6a317020fb137d5b Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 22 Jun 2016 16:49:48 +0800 Subject: [PATCH 0156/3220] netfilter: xt_quota2: make quota2_log work well In upstream commit 7200135bc1e61f1437dc326ae2ef2f310c50b4eb (netfilter: kill ulog targets) http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7200135bc1e6 the ipt_ULOG target was removed; meanwhile, the IP_NF_TARGET_ULOG Kconfig symbol and the ipt_ULOG.h header file were removed too. As a result, QUOTA2_LOG cannot be enabled, and netd complains with this error: "Unable to open quota socket". So when the quota2 limit is reached, userspace will not be notified of this event. Since IP_NF_TARGET_ULOG was removed, we need not depend on "IP_NF_TARGET_ULOG=n", and for compatibility we add the ulog_packet_msg_t related definitions copied from "ipt_ULOG.h".
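For context, a rough sketch of how a userspace daemon such as netd could consume these messages. It assumes the ulog_packet_msg_t layout added below is copied into the daemon's source, and it assumes NETLINK_NFLOG (protocol 5) with multicast group 1 as the delivery channel; those socket details are assumptions of this sketch, not taken from the patch.

#include <linux/netlink.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef NETLINK_NFLOG
#define NETLINK_NFLOG 5 /* assumed protocol used by quota2 logging */
#endif

int read_quota2_event(void)
{
    struct sockaddr_nl addr;
    char buf[4096];
    ssize_t len;
    int sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_NFLOG);

    if (sock < 0)
        return -1;

    memset(&addr, 0, sizeof(addr));
    addr.nl_family = AF_NETLINK;
    addr.nl_groups = 1; /* assumed quota2 multicast group */
    if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        close(sock);
        return -1;
    }

    len = recv(sock, buf, sizeof(buf), 0);
    if (len >= (ssize_t)NLMSG_HDRLEN) {
        struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
        ulog_packet_msg_t *pm = NLMSG_DATA(nlh);

        /* the prefix carries the name of the quota that was hit */
        printf("quota2 alert: %.32s\n", pm->prefix);
    }
    close(sock);
    return 0;
}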
Change-Id: I38132efaabf52bea75dfd736ce734a1b9690e87e Reported-by: Samboo Shen Signed-off-by: Liping Zhang --- net/netfilter/Kconfig | 1 - net/netfilter/xt_quota2.c | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 273f26b67653..1959548b1161 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1354,7 +1354,6 @@ config NETFILTER_XT_MATCH_QUOTA2 config NETFILTER_XT_MATCH_QUOTA2_LOG bool '"quota2" Netfilter LOG support' depends on NETFILTER_XT_MATCH_QUOTA2 - depends on IP_NF_TARGET_ULOG=n # not yes, not module, just no default n help This option allows `quota2' to log ONCE when a quota limit diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c index 99592ae56d9b..834594aa0085 100644 --- a/net/netfilter/xt_quota2.c +++ b/net/netfilter/xt_quota2.c @@ -21,8 +21,27 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_quota2.h> + #ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG -#include <linux/netfilter_ipv4/ipt_ULOG.h> +/* For compatibility, these definitions are copied from the + * deprecated header file <linux/netfilter_ipv4/ipt_ULOG.h> */ +#define ULOG_MAC_LEN 80 +#define ULOG_PREFIX_LEN 32 + +/* Format of the ULOG packets passed through netlink */ +typedef struct ulog_packet_msg { + unsigned long mark; + long timestamp_sec; + long timestamp_usec; + unsigned int hook; + char indev_name[IFNAMSIZ]; + char outdev_name[IFNAMSIZ]; + size_t data_len; + char prefix[ULOG_PREFIX_LEN]; + unsigned char mac_len; + unsigned char mac[ULOG_MAC_LEN]; + unsigned char payload[0]; +} ulog_packet_msg_t; #endif /** From 286d25ba46346d5b7125b2155ce92a63a73f4d7b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:51 -0700 Subject: [PATCH 0157/3220] BACKPORT: timer: convert timer_slack_ns from unsigned long to u64 This backports da8b44d5a9f8bf26da637b7336508ca534d6b319 from upstream. This patchset introduces a /proc/<tid>/timerslack_ns interface which would allow controlling processes to be able to set the timerslack value on other processes in order to save power by avoiding wakeups (Something Android currently does via out-of-tree patches). The first patch tries to fix the internal timer_slack_ns usage which was defined as a long, which limits the slack range to ~4 seconds on 32bit systems. It converts it to a u64, which provides the same basically unlimited slack (500 years) on both 32bit and 64bit machines. The second patch introduces the /proc/<tid>/timerslack_ns interface which allows the full 64bit slack range for a task to be read or set on both 32bit and 64bit machines. With these two patches, on a 32bit machine, after setting the slack on bash to 10 seconds: $ time sleep 1 real 0m10.747s user 0m0.001s sys 0m0.005s The first patch is a little ugly, since I had to chase the slack delta arguments through a number of functions converting them to u64s. Let me know if it makes sense to break that up more or not. Other than that things are fairly straightforward. This patch (of 2): The timer_slack_ns value in the task struct is currently an unsigned long. This means that for 32bit applications, the maximum slack is just over 4 seconds. However, on 64bit machines, it is much, much larger (~500 years). This disparity could make application development a little confusing, so this patch converts timer_slack_ns (as well as the default_slack) to a u64. This means both 32bit and 64bit systems have the same effective internal slack range.
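To make the ABI discussion that follows concrete, a minimal userspace sketch of the existing prctl interface (standard PR_GET_TIMERSLACK/PR_SET_TIMERSLACK usage; the 10ms value is illustrative):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
    long slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);

    printf("current timer slack: %ld ns\n", slack);

    /* request 10ms of slack; on 32bit the prctl argument is still an
     * unsigned long, narrower than the new u64 internal range
     */
    if (prctl(PR_SET_TIMERSLACK, 10 * 1000 * 1000UL, 0, 0, 0) != 0)
        perror("PR_SET_TIMERSLACK");
    return 0;
}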
Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify the interface as a unsigned long, so we preserve that limitation on 32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is actually larger then what can be stored by an unsigned long. This patch also modifies hrtimer functions which specified the slack delta as a unsigned long. Signed-off-by: John Stultz Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Kees Cook Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- fs/select.c | 8 ++++---- include/linux/freezer.h | 2 +- include/linux/hrtimer.h | 12 +++++++----- include/linux/poll.h | 2 +- include/linux/sched.h | 4 ++-- kernel/sys.c | 5 ++++- kernel/time/hrtimer.c | 8 ++++---- kernel/time/timer.c | 4 ++-- 9 files changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4c999ce7e73a..3ab9c68b8bce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1588,7 +1588,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; unsigned long flags; - long slack = 0; + u64 slack = 0; wait_queue_t wait; ktime_t expires, *to = NULL; diff --git a/fs/select.c b/fs/select.c index 015547330e88..09e71a00a9b8 100644 --- a/fs/select.c +++ b/fs/select.c @@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -long select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec *tv) { - unsigned long ret; + u64 ret; struct timespec now; /* @@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; @@ -784,7 +784,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 6b7fd9cf5ea2..dd03e837ebb7 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -231,7 +231,7 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout) * call this with locks held. 
*/ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode) + u64 delta, const enum hrtimer_mode mode) { int __retval; freezer_do_not_count(); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..90d8be19505b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -218,7 +218,7 @@ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time timer->node.expires = ktime_add_safe(time, delta); } -static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) +static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); @@ -355,7 +355,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long range_ns, const enum hrtimer_mode mode); + u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer on the current CPU @@ -376,7 +376,7 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - unsigned long delta; + u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); @@ -449,10 +449,12 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); -extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode, int clock); + u64 delta, + const enum hrtimer_mode mode, + int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ diff --git a/include/linux/poll.h b/include/linux/poll.h index c08386fb3e08..9fb4f40d9a26 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern long select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) diff --git a/include/linux/sched.h b/include/linux/sched.h index fa39434e3fdd..a8012d691abf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1767,8 +1767,8 @@ struct task_struct { * time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. 
*/ - unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns; + u64 timer_slack_ns; + u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; diff --git a/kernel/sys.c b/kernel/sys.c index 11333311cf1c..0c00e563b153 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2319,7 +2319,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..c98439fdce81 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -963,7 +963,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -1529,7 +1529,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1705,7 +1705,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1773,7 +1773,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..d1798fa0c743 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } From eb976ed445e57de1e6469b9f3a1aaf33f862c1dd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:54 -0700 Subject: [PATCH 0158/3220] BACKPORT: proc: add /proc//timerslack_ns interface This backports 5de23d435e88996b1efe0e2cebe242074ce67c9e This patch provides a proc/PID/timerslack_ns interface which exposes a task's timerslack value in nanoseconds and allows it to be changed. This allows power/performance management software to set timer slack for other threads according to its policy for the thread (such as when the thread is designated foreground vs. background activity) If the value written is non-zero, slack is set to that value. Otherwise sets it to the default for the thread. 
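As an illustration, a power/performance management daemon could drive the new file roughly like this (a minimal sketch; the helper name and values are made up, the file semantics are as described in this patch):

#include <stdio.h>
#include <sys/types.h>

/* set (or reset, with 0) another task's timer slack; the caller needs
 * PTRACE_MODE_ATTACH-level permissions on the target task
 */
static int set_timerslack(pid_t pid, unsigned long long slack_ns)
{
    char path[64];
    FILE *f;

    snprintf(path, sizeof(path), "/proc/%d/timerslack_ns", (int)pid);
    f = fopen(path, "w");
    if (!f)
        return -1;
    fprintf(f, "%llu", slack_ns); /* 0 restores the task's default slack */
    return fclose(f);
}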
This interface checks that the calling task has permissions to use PTRACE_MODE_ATTACH_FSCREDS on the target task, so that we can ensure arbitrary apps do not change the timer slack for other apps. Signed-off-by: John Stultz Acked-by: Kees Cook Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 18 ++++++++ fs/proc/base.c | 67 ++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0bcba2823d9d..1c425191574c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -43,6 +43,7 @@ Table of Contents 3.7 /proc/<pid>/task/<tid>/children - Information about task children 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file 3.9 /proc/<pid>/map_files - Information about memory mapped files + 3.10 /proc/<pid>/timerslack_ns - Task timerslack value 4 Configuring procfs 4.1 Mount options @@ -1856,6 +1857,23 @@ time one can open(2) mappings from the listings of two processes and comparing their inode numbers to figure out which anonymous memory areas are actually shared. +3.10 /proc/<pid>/timerslack_ns - Task timerslack value +--------------------------------------------------------- +This file provides the value of the task's timerslack value in nanoseconds. +This value specifies an amount of time that normal timers may be deferred +in order to coalesce timers and avoid unnecessary wakeups. + +This allows a task's interactivity vs. power consumption trade-off to be +adjusted. + +Writing 0 to the file will set the task's timerslack to the default value. + +Valid values are from 0 - ULLONG_MAX + +An application setting the value must have PTRACE_MODE_ATTACH_FSCREDS level +permissions on the task specified to change its timerslack_ns value.
+ + ------------------------------------------------------------------------------ Configuring procfs ------------------------------------------------------------------------------ diff --git a/fs/proc/base.c b/fs/proc/base.c index c21bb4beb0d7..bb51e4ceb15b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2243,6 +2243,72 @@ static const struct file_operations proc_timers_operations = { .release = seq_release_private, }; +static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + u64 slack_ns; + int err; + + err = kstrtoull_from_user(buf, count, 10, &slack_ns); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + } else + count = -EPERM; + + put_task_struct(p); + + return count; +} + +static int timerslack_ns_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + int err = 0; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + } else + err = -EPERM; + + put_task_struct(p); + + return err; +} + +static int timerslack_ns_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timerslack_ns_show, inode); +} + +static const struct file_operations proc_pid_set_timerslack_ns_operations = { + .open = timerslack_ns_open, + .read = seq_read, + .write = timerslack_ns_write, + .llseek = seq_lseek, + .release = single_release, +}; + static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2820,6 +2886,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), #endif + REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) From e6ec611adde326cd28f0692a263b31821f86b071 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:04 -0800 Subject: [PATCH 0159/3220] BACKPORT: ptrace: use fsuid, fsgid, effective creds for fs access checks This patch backports 969624b (which backports caaee6234d0 upstream), from the v4.4-stable branch to the common/android-4.4 branch. This patch is needed to provide the PTRACE_MODE_ATTACH_FSCREDS definition which was used by the backported version of proc//timerslack_ns in change-id: Ie5799b9a3402a31f88cd46437dcda4a0e46415a7 commit caaee6234d05a58c5b4d05e7bf766131b810a657 upstream. By checking the effective credentials instead of the real UID / permitted capabilities, ensure that the calling process actually intended to use its credentials. To ensure that all ptrace checks use the correct caller credentials (e.g. in case out-of-tree code or newly added code omits the PTRACE_MODE_*CREDS flag), use two new flags and require one of them to be set. The problem was that when a privileged task had temporarily dropped its privileges, e.g. by calling setreuid(0, user_uid), with the intent to perform following syscalls with the credentials of a user, it still passed ptrace access checks that the user would not be able to pass. 
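A hedged userspace sketch of that dropped-privileges pattern (illustrative only; /proc/1/maps stands in for any ptrace-gated procfs file):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* running as a privileged task: switch the effective uid to an
 * unprivileged user, then read a procfs file gated by the ptrace
 * access check. Before this change the check looked at the real
 * uid and permitted capabilities, so the open could still succeed
 * even though the task was acting with the user's credentials;
 * with PTRACE_MODE_*_FSCREDS it is denied.
 */
int demo(uid_t user_uid)
{
    FILE *f;
    char line[256];

    if (setreuid(0, user_uid) != 0) /* euid = user, ruid stays 0 */
        return -1;

    f = fopen("/proc/1/maps", "r"); /* filesystem access -> fsuid applies */
    if (!f)
        return -1;
    if (fgets(line, sizeof(line), f))
        printf("%s", line);
    fclose(f);
    return 0;
}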
While an attacker should not be able to convince the privileged task to perform a ptrace() syscall, this is a problem because the ptrace access check is reused for things in procfs. In particular, the following somewhat interesting procfs entries only rely on ptrace access checks: /proc/$pid/stat - uses the check for determining whether pointers should be visible, useful for bypassing ASLR /proc/$pid/maps - also useful for bypassing ASLR /proc/$pid/cwd - useful for gaining access to restricted directories that contain files with lax permissions, e.g. in this scenario: lrwxrwxrwx root root /proc/13020/cwd -> /root/foobar drwx------ root root /root drwxr-xr-x root root /root/foobar -rw-r--r-- root root /root/foobar/secret Therefore, on a system where a root-owned mode 6755 binary changes its effective credentials as described and then dumps a user-specified file, this could be used by an attacker to reveal the memory layout of root's processes or reveal the contents of files he is not allowed to access (through /proc/$pid/cwd). [akpm@linux-foundation.org: fix warning] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman [jstultz: Cherry-picked for common/android-4.4] Signed-off-by: John Stultz --- fs/proc/array.c | 2 +- fs/proc/base.c | 21 +++++++++++---------- fs/proc/namespaces.c | 4 ++-- include/linux/ptrace.h | 24 +++++++++++++++++++++++- kernel/events/core.c | 2 +- kernel/futex.c | 2 +- kernel/futex_compat.c | 2 +- kernel/kcmp.c | 4 ++-- kernel/ptrace.c | 39 +++++++++++++++++++++++++++++++-------- mm/process_vm_access.c | 2 +- security/commoncap.c | 7 ++++++- 11 files changed, 80 insertions(+), 29 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index d73291f5f0fc..b6c00ce0e29e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/fs/proc/base.c b/fs/proc/base.c index bb51e4ceb15b..db221e53a1ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = { static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { @@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) + && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else seq_putc(m, '0'); @@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task) int err = mutex_lock_killable(&task->signal->cred_guard_mutex); if (err) return err; - if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + if (!ptrace_may_access(task, 
PTRACE_MODE_ATTACH_FSCREDS)) { mutex_unlock(&task->signal->cred_guard_mutex); return -EPERM; } @@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; @@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, return true; if (in_group_p(pid->pid_gid)) return true; - return ptrace_may_access(task, PTRACE_MODE_READ); + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } @@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { - mm = mm_access(task, mode); + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { @@ -1856,7 +1857,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (!task) goto out_notask; - mm = mm_access(task, PTRACE_MODE_READ); + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; @@ -2007,7 +2008,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = -ENOENT; @@ -2060,7 +2061,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) goto out; ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; @@ -2596,7 +2597,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (result) return result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index f6e8354b8cea..1b0ea4a5d89e 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,7 +42,7 @@ static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) if (!task) return error; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) nd_jump_link(&ns_path); @@ -63,7 +63,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 061265f92876..504c98a278d4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -57,7 +57,29 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 -/* Returns true on success, false on denial. 
*/ +#define PTRACE_MODE_FSCREDS 0x08 +#define PTRACE_MODE_REALCREDS 0x10 + +/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ +#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) +#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) + +/** + * ptrace_may_access - check whether the caller is permitted to access + * a target task. + * @task: target task + * @mode: selects type of access and caller credentials + * + * Returns true on success, false on denial. + * + * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must + * be set in @mode to specify whether the access was requested through + * a filesystem syscall (should use effective capabilities and fsuid + * of the caller) or through an explicit syscall such as + * process_vm_writev or ptrace (and should use the real credentials). + */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) diff --git a/kernel/events/core.c b/kernel/events/core.c index 85bc810bece6..2f07e9c34f43 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3439,7 +3439,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; diff --git a/kernel/futex.c b/kernel/futex.c index 684d7549825a..495a1d06915b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2881,7 +2881,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c9349cfe..4ae3232e7a28 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea1d8fd..3a47fa998fe0 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b760bae64cf1..3189e51db7e8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? 
* This check is used both for attaching with ptrace @@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). + */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071648c2..5d453e58ddbf 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* diff --git a/security/commoncap.c b/security/commoncap.c index f035b84b3601..7fa251aea32f 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -148,12 +148,17 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; + const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); + if (mode & PTRACE_MODE_FSCREDS) + caller_caps = &cred->cap_effective; + else + caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && - cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) + cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; From 50437c019f146eb138d819d558a3f562419573fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 8 Jul 2016 13:24:09 -0700 Subject: [PATCH 0160/3220] UPSTREAM: cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry pick from commit 4d06dd537f95683aba3651098ae288b7cbff8274) usbnet_link_change will call schedule_work and should be avoided if bind is failing. Otherwise we will end up with scheduled work referring to a netdev which has gone away. 
Instead of making the call conditional, we can just defer it to usbnet_probe, using the driver_info flag made for this purpose. Fixes: 8a34b0ae8778 ("usbnet: cdc_ncm: apply usbnet_link_change") Reported-by: Andrey Konovalov Suggested-by: Linus Torvalds Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller Change-Id: Id9a6d02bdd98bf495d26595cf2cc90e480746186 Bug: 28744625 --- drivers/net/usb/cdc_ncm.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index e8a1144c5a8b..93c88a2ce55c 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -941,8 +941,6 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) { - int ret; - /* MBIM backwards compatible function? */ if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; @@ -951,16 +949,7 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) * Additionally, generic NCM devices are assumed to accept arbitrarily * placed NDP. */ - ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); - - /* - * We should get an event when network connection is "connected" or - * "disconnected". Set network connection in "disconnected" state - * (carrier is OFF) during attach, so the IP network stack does not - * start IPv6 negotiation and more. - */ - usbnet_link_change(dev, 0, 0); - return ret; + return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); } static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max) @@ -1543,7 +1532,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", - .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET, + .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET + | FLAG_LINK_INTR, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1556,7 +1546,7 @@ static const struct driver_info cdc_ncm_info = { static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN, + | FLAG_LINK_INTR | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1569,7 +1559,7 @@ static const struct driver_info wwan_info = { static const struct driver_info wwan_noarp_info = { .description = "Mobile Broadband Network Device (NO ARP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN | FLAG_NOARP, + | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, From 5b2129ec25a33cda9662232afdfa6af7c9bc168a Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 8 Jul 2016 14:15:14 -0700 Subject: [PATCH 0161/3220] sdcardfs: Truncate packages_gid.list on overflow packages_gid.list was returning the wrong count. Use scnprintf instead, and inform the user when the list has been truncated.
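The semantics that matter here, as a small standalone userspace sketch (my_scnprintf re-implements the kernel helper for illustration, and the package name is made up): snprintf() returns the length the output would have needed, so accumulating its return value can push the write offset past the buffer, while scnprintf() returns only what was actually stored.

#include <stdarg.h>
#include <stdio.h>

/* userspace stand-in for the kernel's scnprintf() */
static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int i;

	if (size == 0)
		return 0;
	va_start(args, fmt);
	i = vsnprintf(buf, size, fmt, args);
	va_end(args);
	return i < (int)size ? i : (int)size - 1;
}

int main(void)
{
	char page[16];
	int count;

	/* snprintf() reports the 22 bytes the line *would* need even
	 * though only 15 fit; page + count now points past the buffer
	 * and sizeof(page) - count underflows on the next entry. */
	count = snprintf(page, sizeof(page), "com.example.app 10099\n");
	printf("snprintf: count=%d, buffer=%zu\n", count, sizeof(page));

	/* scnprintf() semantics: count never exceeds the buffer */
	count = my_scnprintf(page, sizeof(page), "com.example.app 10099\n");
	printf("scnprintf: count=%d\n", count);
	return 0;
}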
Bug: 30013843 Change-Id: Ida2b2ef7cd86dd87300bfb4c2cdb6bfe2ee1650d Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/packagelist.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 10f0d6be718b..9c3340528eee 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -335,12 +335,19 @@ static ssize_t packages_attr_show(struct config_item *item, struct hashtable_entry *hash_cur; struct hlist_node *h_t; int i; - int count = 0; - mutex_lock(&pkgl_data_all->hashtable_lock); - hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) - count += snprintf(page + count, PAGE_SIZE - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); - mutex_unlock(&pkgl_data_all->hashtable_lock); + int count = 0, written = 0; + char errormsg[] = "\n"; + mutex_lock(&pkgl_data_all->hashtable_lock); + hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) { + written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); + if (count + written == PAGE_SIZE - sizeof(errormsg)) { + count += scnprintf(page + count, PAGE_SIZE - count, errormsg); + break; + } + count += written; + } + mutex_unlock(&pkgl_data_all->hashtable_lock); return count; } From 6b72c6ef4442d1e4cbae32839ee69a1b42704d86 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 12 Jun 2016 17:37:52 -0700 Subject: [PATCH 0162/3220] android-recommended.cfg: enable fstack-protector-strong If compiler has stack protector support, set CONFIG_CC_STACKPROTECTOR_STRONG. Bug: 28967314 Change-Id: I588c2d544250e9e4b5082b43c237b8f85b7313ca Signed-off-by: Jeff Vander Stoep --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index c3222a77ba24..2f1ef077aa9e 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -11,6 +11,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_DEBUG_RODATA=y CONFIG_DM_UEVENT=y From 96f61ff98aab275e0ac7dae4a42d3357e3d6e4a2 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 1 Jun 2016 10:28:49 -0700 Subject: [PATCH 0163/3220] ANDROID: sdcardfs: fix itnull.cocci warnings List_for_each_entry has the property that the first argument is always bound to a real list element, never NULL, so testing dentry is not needed. 
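The coccinelle finding follows directly from the macro's definition (condensed from include/linux/list.h): the cursor is computed from the node with container_of() and the loop exits when the cursor's member wraps back around to the list head, so within the body the cursor always refers to a real element and a NULL test can never fire.

#define list_for_each_entry(pos, head, member)				\
	for (pos = list_first_entry(head, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_next_entry(pos, member))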
Generated by: scripts/coccinelle/iterators/itnull.cocci Cc: Daniel Rosenberg Signed-off-by: Julia Lawall Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- fs/sdcardfs/derived_perm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 128b3e56851f..41e0e11b3c35 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -112,7 +112,7 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) void get_derive_permissions_recursive(struct dentry *parent) { struct dentry *dentry; list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (dentry && dentry->d_inode) { + if (dentry->d_inode) { mutex_lock(&dentry->d_inode->i_mutex); get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); From 9cb6c482ae8032970efe10f6f15d3bbee7795fb4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:52 +0100 Subject: [PATCH 0164/3220] UPSTREAM: netfilter: x_tables: fix unconditional helper (cherry pick from commit 54d83fc74aa9ec72794373cb47432c5f7fb1a309) Ben Hawkes says: In the mark_source_chains function (net/ipv4/netfilter/ip_tables.c) it is possible for a user-supplied ipt_entry structure to have a large next_offset field. This field is not bounds checked prior to writing a counter value at the supplied offset. Problem is that mark_source_chains should not have been called -- the rule doesn't have a next entry, so it's supposed to return an absolute verdict of either ACCEPT or DROP. However, the function conditional() doesn't work as the name implies. It only checks that the rule is using wildcard address matching. However, an unconditional rule must also not be using any matches (no -m args). The underflow validator only checked the addresses, therefore passing the 'unconditional absolute verdict' test, while mark_source_chains also tested for presence of matches, and thus proceeded to the next (non-existent) rule. Unify this so that all the callers have the same idea of 'unconditional rule'. Reported-by: Ben Hawkes Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: I47ec0713ac563ac244200c7b2c54f09a91aceabc Bug: 28940694 --- net/ipv4/netfilter/arp_tables.c | 18 +++++++++--------- net/ipv4/netfilter/ip_tables.c | 23 +++++++++++------------ net/ipv6/netfilter/ip6_tables.c | 23 +++++++++++------------ 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 11dccba474b7..e5697e649ce1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -359,11 +359,12 @@ unsigned int arpt_do_table(struct sk_buff *skb, } /* All zeroes == unconditional rule. */ -static inline bool unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; - return memcmp(arp, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct arpt_entry) && + memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if
*/ - if ((e->target_offset == sizeof(struct arpt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->arp)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -557,7 +557,7 @@ static bool check_underflow(const struct arpt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->arp)) + if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -598,9 +598,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b99affad6ba1..2bf6b3f1a00c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; - return memcmp(ip, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ipt_entry) && + memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } @@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ip)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -476,11 +476,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct ipt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->ip)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -721,7 +720,7 @@ static bool check_underflow(const struct ipt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ip)) + if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -763,9 +762,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 99425cf2819b..37e945cfa5da 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -198,11 +198,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ip6t_ip6 *ipv6) +static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; - return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ip6t_entry) && + memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * @@ -258,11 +259,10 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ip6t_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ipv6)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] @@ -488,11 +488,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. 
*/ - if ((e->target_offset == sizeof(struct ip6t_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && - unconditional(&e->ipv6)) || visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -733,7 +732,7 @@ static bool check_underflow(const struct ip6t_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ipv6)) + if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -775,9 +774,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; From f8a27f34070e29bdbbe88f5330f77d8ac5d1c2fb Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 23 Mar 2016 16:38:55 +0100 Subject: [PATCH 0165/3220] UPSTREAM: ppp: take reference on channels netns (cherry pick from commit 1f461dcdd296eecedaffffc6bae2bfa90bd7eb89) Let channels hold a reference on their network namespace. Some channel types, like ppp_async and ppp_synctty, can have their userspace controller running in a different namespace. Therefore they can't rely on them to preclude their netns from being removed from under them. ================================================================== BUG: KASAN: use-after-free in ppp_unregister_channel+0x372/0x3a0 at addr ffff880064e217e0 Read of size 8 by task syz-executor/11581 ============================================================================= BUG net_namespace (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in copy_net_ns+0x6b/0x1a0 age=92569 cpu=3 pid=6906 [< none >] ___slab_alloc+0x4c7/0x500 kernel/mm/slub.c:2440 [< none >] __slab_alloc+0x4c/0x90 kernel/mm/slub.c:2469 [< inline >] slab_alloc_node kernel/mm/slub.c:2532 [< inline >] slab_alloc kernel/mm/slub.c:2574 [< none >] kmem_cache_alloc+0x23a/0x2b0 kernel/mm/slub.c:2579 [< inline >] kmem_cache_zalloc kernel/include/linux/slab.h:597 [< inline >] net_alloc kernel/net/core/net_namespace.c:325 [< none >] copy_net_ns+0x6b/0x1a0 kernel/net/core/net_namespace.c:360 [< none >] create_new_namespaces+0x2f6/0x610 kernel/kernel/nsproxy.c:95 [< none >] copy_namespaces+0x297/0x320 kernel/kernel/nsproxy.c:150 [< none >] copy_process.part.35+0x1bf4/0x5760 kernel/kernel/fork.c:1451 [< inline >] copy_process kernel/kernel/fork.c:1274 [< none >] _do_fork+0x1bc/0xcb0 kernel/kernel/fork.c:1723 [< inline >] SYSC_clone kernel/kernel/fork.c:1832 [< none >] SyS_clone+0x37/0x50 kernel/kernel/fork.c:1826 [< none >] entry_SYSCALL_64_fastpath+0x16/0x7a kernel/arch/x86/entry/entry_64.S:185 INFO: Freed in net_drop_ns+0x67/0x80 age=575 cpu=2 pid=2631 [< none >] __slab_free+0x1fc/0x320 kernel/mm/slub.c:2650 [< inline >] slab_free kernel/mm/slub.c:2805 [< none >] kmem_cache_free+0x2a0/0x330 kernel/mm/slub.c:2814 [< inline >] net_free kernel/net/core/net_namespace.c:341 [< none >] net_drop_ns+0x67/0x80 kernel/net/core/net_namespace.c:348 [< none >] cleanup_net+0x4e5/0x600 kernel/net/core/net_namespace.c:448 [< none >] process_one_work+0x794/0x1440 
kernel/kernel/workqueue.c:2036 [< none >] worker_thread+0xdb/0xfc0 kernel/kernel/workqueue.c:2170 [< none >] kthread+0x23f/0x2d0 kernel/drivers/block/aoe/aoecmd.c:1303 [< none >] ret_from_fork+0x3f/0x70 kernel/arch/x86/entry/entry_64.S:468 INFO: Slab 0xffffea0001938800 objects=3 used=0 fp=0xffff880064e20000 flags=0x5fffc0000004080 INFO: Object 0xffff880064e20000 @offset=0 fp=0xffff880064e24200 CPU: 1 PID: 11581 Comm: syz-executor Tainted: G B 4.4.0+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014 00000000ffffffff ffff8800662c7790 ffffffff8292049d ffff88003e36a300 ffff880064e20000 ffff880064e20000 ffff8800662c77c0 ffffffff816f2054 ffff88003e36a300 ffffea0001938800 ffff880064e20000 0000000000000000 Call Trace: [< inline >] __dump_stack kernel/lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 kernel/lib/dump_stack.c:50 [] print_trailer+0xf4/0x150 kernel/mm/slub.c:654 [] object_err+0x2f/0x40 kernel/mm/slub.c:661 [< inline >] print_address_description kernel/mm/kasan/report.c:138 [] kasan_report_error+0x215/0x530 kernel/mm/kasan/report.c:236 [< inline >] kasan_report kernel/mm/kasan/report.c:259 [] __asan_report_load8_noabort+0x3e/0x40 kernel/mm/kasan/report.c:280 [< inline >] ? ppp_pernet kernel/include/linux/compiler.h:218 [] ? ppp_unregister_channel+0x372/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [< inline >] ppp_pernet kernel/include/linux/compiler.h:218 [] ppp_unregister_channel+0x372/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [< inline >] ? ppp_pernet kernel/drivers/net/ppp/ppp_generic.c:293 [] ? ppp_unregister_channel+0xe6/0x3a0 kernel/drivers/net/ppp/ppp_generic.c:2392 [] ppp_asynctty_close+0xa3/0x130 kernel/drivers/net/ppp/ppp_async.c:241 [] ? async_lcp_peek+0x5b0/0x5b0 kernel/drivers/net/ppp/ppp_async.c:1000 [] tty_ldisc_close.isra.1+0x99/0xe0 kernel/drivers/tty/tty_ldisc.c:478 [] tty_ldisc_kill+0x40/0x170 kernel/drivers/tty/tty_ldisc.c:744 [] tty_ldisc_release+0x1b3/0x260 kernel/drivers/tty/tty_ldisc.c:772 [] tty_release+0xac1/0x13e0 kernel/drivers/tty/tty_io.c:1901 [] ? release_tty+0x320/0x320 kernel/drivers/tty/tty_io.c:1688 [] __fput+0x236/0x780 kernel/fs/file_table.c:208 [] ____fput+0x15/0x20 kernel/fs/file_table.c:244 [] task_work_run+0x16b/0x200 kernel/kernel/task_work.c:115 [< inline >] exit_task_work kernel/include/linux/task_work.h:21 [] do_exit+0x8b5/0x2c60 kernel/kernel/exit.c:750 [] ? debug_check_no_locks_freed+0x290/0x290 kernel/kernel/locking/lockdep.c:4123 [] ? mm_update_next_owner+0x6f0/0x6f0 kernel/kernel/exit.c:357 [] ? __dequeue_signal+0x136/0x470 kernel/kernel/signal.c:550 [] ? recalc_sigpending_tsk+0x13b/0x180 kernel/kernel/signal.c:145 [] do_group_exit+0x108/0x330 kernel/kernel/exit.c:880 [] get_signal+0x5e4/0x14f0 kernel/kernel/signal.c:2307 [< inline >] ? kretprobe_table_lock kernel/kernel/kprobes.c:1113 [] ? kprobe_flush_task+0xb5/0x450 kernel/kernel/kprobes.c:1158 [] do_signal+0x83/0x1c90 kernel/arch/x86/kernel/signal.c:712 [] ? recycle_rp_inst+0x310/0x310 kernel/include/linux/list.h:655 [] ? setup_sigcontext+0x780/0x780 kernel/arch/x86/kernel/signal.c:165 [] ? finish_task_switch+0x424/0x5f0 kernel/kernel/sched/core.c:2692 [< inline >] ? finish_lock_switch kernel/kernel/sched/sched.h:1099 [] ? finish_task_switch+0x120/0x5f0 kernel/kernel/sched/core.c:2678 [< inline >] ? context_switch kernel/kernel/sched/core.c:2807 [] ? 
__schedule+0x919/0x1bd0 kernel/kernel/sched/core.c:3283 [] exit_to_usermode_loop+0xf1/0x1a0 kernel/arch/x86/entry/common.c:247 [< inline >] prepare_exit_to_usermode kernel/arch/x86/entry/common.c:282 [] syscall_return_slowpath+0x19f/0x210 kernel/arch/x86/entry/common.c:344 [] int_ret_from_sys_call+0x25/0x9f kernel/arch/x86/entry/entry_64.S:281 Memory state around the buggy address: ffff880064e21680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880064e21700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff880064e21780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff880064e21800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff880064e21880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Fixes: 273ec51dd7ce ("net: ppp_generic - introduce net-namespace functionality v2") Reported-by: Baozeng Ding Signed-off-by: Guillaume Nault Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. Miller Change-Id: Iee0015eca5bd181954bb4896a3720f7549c5ed0b Bug: 28979703 --- drivers/net/ppp/ppp_generic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 9a863c6a6a33..afe6fddd2baa 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2290,7 +2290,7 @@ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan) pch->ppp = NULL; pch->chan = chan; - pch->chan_net = net; + pch->chan_net = get_net(net); chan->ppp = pch; init_ppp_file(&pch->file, CHANNEL); pch->file.hdrlen = chan->hdrlen; @@ -2387,6 +2387,8 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); + put_net(pch->chan_net); + pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); From 1d2d5ceaf5ae9d41656a084d394a1e38f1a80d3c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Feb 2016 11:03:12 +0000 Subject: [PATCH 0166/3220] UPSTREAM: KEYS: Fix ASN.1 indefinite length object parsing (cherry pick from commit 23c8a812dc3c621009e4f0e5342aa4e2ede1ceaa) This fixes CVE-2016-0758. In the ASN.1 decoder, when the length field of an ASN.1 value is extracted, it isn't validated against the remaining amount of data before being added to the cursor. With a sufficiently large size indicated, the check: datalen - dp < 2 may then fail due to integer overflow. Fix this by checking the length indicated against the amount of remaining data in both places a definite length is determined. Whilst we're at it, make the following changes: (1) Check the maximum size of extended length does not exceed the capacity of the variable it's being stored in (len) rather than the type that variable is assumed to be (size_t). (2) Compare the EOC tag to the symbolic constant ASN1_EOC rather than the integer 0. (3) To reduce confusion, move the initialisation of len outside of: for (len = 0; n > 0; n--) { since it doesn't have anything to do with the loop counter n. 
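A standalone sketch of the arithmetic the new check_length test closes off (values invented; assumes a 64-bit size_t, matching the decoder's use of size_t for dp and datalen): once a definite length larger than the remaining data is added to the cursor, the later unsigned comparisons wrap instead of failing.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t datalen = 64, dp = 10;
	size_t len = 0xffffffffffffff00UL;	/* attacker-supplied length */

	/* old flow: advance first, validate later */
	dp += len;				/* wraps to a huge bogus index */
	printf("datalen - dp = %zu, '< 2' check %s\n", datalen - dp,
	       datalen - dp < 2 ? "trips" : "passes (the bug)");

	/* fixed flow: validate against the remaining bytes, then advance */
	dp = 10;
	if (len > datalen - dp)
		printf("data_overrun_error\n");
	return 0;
}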
Signed-off-by: David Howells Reviewed-by: Mimi Zohar Acked-by: David Woodhouse Acked-by: Peter Jones Change-Id: If760bc3b8ab0e59fefc24fa687514324348fb8e8 Bug: 29814470 --- lib/asn1_decoder.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2b3f46c049d4..554522934c44 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -74,7 +74,7 @@ next_tag: /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. */ if (data[dp++] != 0) goto invalid_eoc; @@ -96,10 +96,8 @@ next_tag: /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -110,14 +108,18 @@ next_tag: } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; From 2e2e345376a1e0320956512d9ccfa8d9dac60d27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Jun 2016 15:48:57 +0100 Subject: [PATCH 0167/3220] UPSTREAM: KEYS: potential uninitialized variable (cherry picked from commit 38327424b40bcebe2de92d07312c89360ac9229a) If __key_link_begin() failed then "edit" would be uninitialized. I've added a check to fix that. This allows a random user to crash the kernel, though it's quite difficult to achieve. There are three ways it can be done as the user would have to cause an error to occur in __key_link(): (1) Cause the kernel to run out of memory. In practice, this is difficult to achieve without ENOMEM cropping up elsewhere and aborting the attempt. (2) Revoke the destination keyring between the keyring ID being looked up and it being tested for revocation. In practice, this is difficult to time correctly because the KEYCTL_REJECT function can only be used from the request-key upcall process. Further, users can only make use of what's in /sbin/request-key.conf, though this does include a rejection debugging test - which means that the destination keyring has to be the caller's session keyring in practice. (3) Have just enough key quota available to create a key, a new session keyring for the upcall and a link in the session keyring, but not then sufficient quota to create a link in the nominated destination keyring so that it fails with EDQUOT. The bug can be triggered using option (3) above using something like the following: echo 80 >/proc/sys/kernel/keys/root_maxbytes keyctl request2 user debug:fred negate @t The above sets the quota to something much lower (80) to make the bug easier to trigger, but this is dependent on the system. Note also that the name of the keyring created contains a random number that may be between 1 and 10 characters in size, so may throw the test off by changing the amount of quota used. Assuming the failure occurs, something like the following will be seen: kfree_debugcheck: out of range ptr 6b6b6b6b6b6b6b68h ------------[ cut here ]------------ kernel BUG at ../mm/slab.c:2821! ...
RIP: 0010:[] kfree_debugcheck+0x20/0x25 RSP: 0018:ffff8804014a7de8 EFLAGS: 00010092 RAX: 0000000000000034 RBX: 6b6b6b6b6b6b6b68 RCX: 0000000000000000 RDX: 0000000000040001 RSI: 00000000000000f6 RDI: 0000000000000300 RBP: ffff8804014a7df0 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8804014a7e68 R11: 0000000000000054 R12: 0000000000000202 R13: ffffffff81318a66 R14: 0000000000000000 R15: 0000000000000001 ... Call Trace: kfree+0xde/0x1bc assoc_array_cancel_edit+0x1f/0x36 __key_link_end+0x55/0x63 key_reject_and_link+0x124/0x155 keyctl_reject_key+0xb6/0xe0 keyctl_negate_key+0x10/0x12 SyS_keyctl+0x9f/0xe7 do_syscall_64+0x63/0x13a entry_SYSCALL64_slow_path+0x25/0x25 Fixes: f70e2e06196a ('KEYS: Do preallocation for __key_link()') Signed-off-by: Dan Carpenter Signed-off-by: David Howells cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: Ia9616cce142a616beea0ef20bde49129939d2d2d Bug: 29823941 --- security/keys/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/key.c b/security/keys/key.c index ab7997ded725..534808915371 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -578,7 +578,7 @@ int key_reject_and_link(struct key *key, mutex_unlock(&key_construction_mutex); - if (keyring) + if (keyring && link_ret == 0) __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ From 44bd40ae321b27fe6b5e107d207a79fe1cbaa507 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:49 +0100 Subject: [PATCH 0168/3220] UPSTREAM: netfilter: x_tables: validate e->target_offset early (cherry pick from commit bdf533de6968e9686df777dc178486f600c6e617) We should check that e->target_offset is sane before mark_source_chains gets called since it will fetch the target entry for loop detection. 
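In miniature (struct names simplified; the real ones are arpt/ipt/ip6t_entry and xt_entry_target), the hazard is plain pointer arithmetic on a user-supplied offset, which the relocated check_entry() now rejects before mark_source_chains() can dereference the result:

struct entry {				/* simplified ipt_entry */
	unsigned short target_offset;	/* user-controlled */
	unsigned short next_offset;	/* user-controlled */
};

struct entry_target {			/* simplified xt_entry_target */
	unsigned short size;
	int verdict;
};

/* how the target is located: nothing stops this pointing anywhere */
static struct entry_target *get_target(struct entry *e)
{
	return (struct entry_target *)((char *)e + e->target_offset);
}

/* essence of the fix: the offset must land inside the entry itself */
static int check_entry(const struct entry *e)
{
	if (e->target_offset + sizeof(struct entry_target) > e->next_offset)
		return -1;	/* kernel returns -EINVAL */
	return 0;
}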
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: Ic2dbc31c9525d698e94d4d8875886acf3524abbd Bug: 29637687 --- net/ipv4/netfilter/arp_tables.c | 17 ++++++++--------- net/ipv4/netfilter/ip_tables.c | 17 ++++++++--------- net/ipv6/netfilter/ip6_tables.c | 17 ++++++++--------- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e5697e649ce1..449e2b56b3b2 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -474,14 +474,12 @@ next: return 1; } -static inline int check_entry(const struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e) { const struct xt_entry_target *t; - if (!arp_checkentry(&e->arp)) { - duprintf("arp_tables: arp check failed %p %s.\n", e, name); + if (!arp_checkentry(&e->arp)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -522,10 +520,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) struct xt_target *target; int ret; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -576,6 +570,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { @@ -590,6 +585,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1246,7 +1245,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct arpt_entry *)e, name); + ret = check_entry((struct arpt_entry *)e); if (ret) return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2bf6b3f1a00c..c3ab1ab12ee6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -568,14 +568,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e) { const struct xt_entry_target *t; - if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, name); + if (!ip_checkentry(&e->ip)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -665,10 +663,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -740,6 +734,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { @@ -754,6 +749,10 @@ check_entry_size_and_hooks(struct ipt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1505,7 +1504,7 @@ 
check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ipt_entry *)e, name); + ret = check_entry((struct ipt_entry *)e); if (ret) return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 37e945cfa5da..f532cb6d9c3d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -580,14 +580,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ip6t_entry *e, const char *name) +check_entry(const struct ip6t_entry *e) { const struct xt_entry_target *t; - if (!ip6_checkentry(&e->ipv6)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); + if (!ip6_checkentry(&e->ipv6)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -678,10 +676,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -752,6 +746,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { @@ -766,6 +761,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -1517,7 +1516,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ip6t_entry *)e, name); + ret = check_entry((struct ip6t_entry *)e); if (ret) return ret; From 160202a2e8a37baf7a1dcd3b3748a19ece5afe3e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2016 18:02:50 +0100 Subject: [PATCH 0169/3220] UPSTREAM: netfilter: x_tables: make sure e->next_offset covers remaining blob size (cherry pick from commit 6e94e0cfb0887e4013b3b930fa6ab1fe6bb6ba91) Otherwise this function may read data beyond the ruleset blob. 
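The walk being hardened, reduced to a skeleton (types simplified): each rule is reached by adding the previous rule's user-supplied next_offset to the cursor, so without comparing it against the blob's limit the cursor can step outside the ruleset before any later validation runs.

struct entry {
	unsigned short next_offset;	/* user-controlled */
};

static int walk(unsigned char *base, unsigned char *limit)
{
	unsigned char *p = base;

	while (p + sizeof(struct entry) <= limit) {
		const struct entry *e = (const struct entry *)p;

		/* the added check: the next entry must stay inside the blob */
		if (e->next_offset < sizeof(struct entry) ||
		    p + e->next_offset > limit)
			return -1;	/* kernel returns -EINVAL */
		p += e->next_offset;	/* previously trusted blindly */
	}
	return 0;
}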
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Change-Id: I9d19ecf3e00a2d52817b35b9042623927895c005 Bug: 29637687 --- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 6 ++++-- net/ipv6/netfilter/ip6_tables.c | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 449e2b56b3b2..36a30fab8625 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -573,7 +573,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1232,7 +1233,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c3ab1ab12ee6..99d46b0a4ead 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -737,7 +737,8 @@ check_entry_size_and_hooks(struct ipt_entry *e, int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1491,7 +1492,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f532cb6d9c3d..6198807e06f4 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -749,7 +749,8 @@ check_entry_size_and_hooks(struct ip6t_entry *e, int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -1503,7 +1504,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } From 71b1886054473597c46a8a25c95477b5262971b5 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Fri, 23 Oct 2015 15:13:42 -0700 Subject: [PATCH 0170/3220] CHROMIUM: 
android: binder: Fix potential scheduling-while-atomic (cherry picked from commit 166b45af97359159f9585a836c9849e725e31fd6) Commit f1e7f0a724f6 ("android: binder: Disable preemption while holding the global binder lock.") re-enabled preemption around most of the sites where calls to potentially sleeping functions were made, but missed __alloc_fd(), which can sleep if the fdtable needs to be resized. Re-enable preemption around __alloc_fd() as well as __fd_install() which can now sleep in upstream kernels as of commit 8a81252b774b ("fs/file.c: don't acquire files->file_lock in fd_install()"). BUG=chrome-os-partner:44012 TEST=Build and boot on Smaug. Change-Id: I9819c4b95876f697e75b1b84810b6c520d9c33ec Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/308582 Reviewed-by: Stephen Barber Reviewed-by: Riley Andrews Bug: 30141999 --- drivers/android/binder.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f0ce9959d14d..b66fe6545f32 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -379,6 +379,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; + int ret; if (files == NULL) return -ESRCH; @@ -389,7 +390,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - return __alloc_fd(files, 0, rlim_cur, flags); + preempt_enable_no_resched(); + ret = __alloc_fd(files, 0, rlim_cur, flags); + preempt_disable(); + + return ret; } /* @@ -398,8 +403,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) + if (proc->files) { + preempt_enable_no_resched(); __fd_install(proc->files, fd, file); + preempt_disable(); + } } /* From 13c17d0179f9a055062f37e91a6f6cf00a249ebd Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Mon, 18 Jul 2016 22:21:12 +0000 Subject: [PATCH 0171/3220] Revert "CHROMIUM: android: binder: Fix potential scheduling-while-atomic" This reverts commit 71b1886054473597c46a8a25c95477b5262971b5. 
Change-Id: I9ded0ff43535c1367c2cf79dfeec20d4b5f0357a --- drivers/android/binder.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b66fe6545f32..f0ce9959d14d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -379,7 +379,6 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; - int ret; if (files == NULL) return -ESRCH; @@ -390,11 +389,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - preempt_enable_no_resched(); - ret = __alloc_fd(files, 0, rlim_cur, flags); - preempt_disable(); - - return ret; + return __alloc_fd(files, 0, rlim_cur, flags); } /* @@ -403,11 +398,8 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) { - preempt_enable_no_resched(); + if (proc->files) __fd_install(proc->files, fd, file); - preempt_disable(); - } } /* From dd802a9c976c9e098d236c551a0c3cb2c6f5271d Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Fri, 5 Jun 2015 18:59:29 -0700 Subject: [PATCH 0172/3220] cpuset: Add allow_attach hook for cpusets on android. This patch provides an allow_attach hook for cpusets, which resolves much of the following logcat noise. W SchedPolicy: add_tid_to_cgroup failed to write '2816' (Permission denied); fd=29 W ActivityManager: Failed setting process group of 2816 to 0 W System.err: java.lang.IllegalArgumentException W System.err: at android.os.Process.setProcessGroup(Native Method) W System.err: at com.android.server.am.ActivityManagerService.applyOomAdjLocked(ActivityManagerService.java:18763) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19028) W System.err: at com.android.server.am.ActivityManagerService.updateOomAdjLocked(ActivityManagerService.java:19106) W System.err: at com.android.server.am.ActiveServices.serviceDoneExecutingLocked(ActiveServices.java:2015) W System.err: at com.android.server.am.ActiveServices.publishServiceLocked(ActiveServices.java:905) W System.err: at com.android.server.am.ActivityManagerService.publishService(ActivityManagerService.java:16065) W System.err: at android.app.ActivityManagerNative.onTransact(ActivityManagerNative.java:1007) W System.err: at com.android.server.am.ActivityManagerService.onTransact(ActivityManagerService.java:2493) W System.err: at android.os.Binder.execTransact(Binder.java:453) Change-Id: Ic1b61b2bbb7ce74c9e9422b5e22ee9078251de21 [Ported to 4.4, added commit message] Signed-off-by: John Stultz --- kernel/cpuset.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02a8ea5c9963..0ff01c2a7222 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2051,12 +2051,30 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +static int cpuset_allow_attach(struct cgroup_taskset *tset) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); + + if ((current != task) && !capable(CAP_SYS_ADMIN) && + cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) +
return -EACCES; + } + + return 0; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, From f394c99b6ac0236d7e06ee406107294d5f389259 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 5 Jul 2016 22:12:36 -0700 Subject: [PATCH 0173/3220] UPSTREAM: ppp: defer netns reference release for ppp channel (cherry pick from commit 205e1e255c479f3fd77446415706463b282f94e4) Matt reported that we have a NULL pointer dereference in ppp_pernet() from ppp_connect_channel(), i.e. pch->chan_net is NULL. This is due to that a parallel ppp_unregister_channel() could happen while we are in ppp_connect_channel(), during which pch->chan_net set to NULL. Since we need a reference to net per channel, it makes sense to sync the refcnt with the life time of the channel, therefore we should release this reference when we destroy it. Fixes: 1f461dcdd296 ("ppp: take reference on channels netns") Reported-by: Matt Bennett Cc: Paul Mackerras Cc: linux-ppp@vger.kernel.org Cc: Guillaume Nault Cc: Cyrill Gorcunov Signed-off-by: Cong Wang Reviewed-by: Cyrill Gorcunov Signed-off-by: David S. Miller Fixes: Change-Id: Iee0015eca5bd181954bb4896a3720f7549c5ed0b ("UPSTREAM: ppp: take reference on channels netns") Signed-off-by: Amit Pundir Change-Id: I24d0bb6f349ab3829f63cfe935ed97b6913a3508 --- drivers/net/ppp/ppp_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index afe6fddd2baa..50cda254b57a 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2387,8 +2387,6 @@ ppp_unregister_channel(struct ppp_channel *chan) spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); - put_net(pch->chan_net); - pch->chan_net = NULL; pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); @@ -2980,6 +2978,9 @@ ppp_disconnect_channel(struct channel *pch) */ static void ppp_destroy_channel(struct channel *pch) { + put_net(pch->chan_net); + pch->chan_net = NULL; + atomic_dec(&channel_count); if (!pch->file.dead) { From 53c39433d5fd7d71bec7e904a6fa6691a9e90fdc Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:05 +0200 Subject: [PATCH 0174/3220] UPSTREAM: proc: prevent stacking filesystems on top (cherry picked from commit e54ad7f1ee263ffa5a2de9c609d58dfa27b21cd9) This prevents stacking filesystems (ecryptfs and overlayfs) from using procfs as lower filesystem. There is too much magic going on inside procfs, and there is no good reason to stack stuff on top of procfs. (For example, procfs does access checks in VFS open handlers, and ecryptfs by design calls open handlers from a kernel thread that doesn't drop privileges or so.) 
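The mechanism being reused, roughly as it appears in a stacking filesystem's mount path (abridged from the ecryptfs pattern): every superblock records its stacking depth, and a new layer refuses to mount once the lower filesystem is already at the limit -- which is exactly the value procfs now claims for itself.

	/* in the stacking fs, when mounting over lower_sb: */
	sb->s_stack_depth = lower_sb->s_stack_depth + 1;
	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
		pr_err("maximum fs stacking depth exceeded\n");
		err = -EINVAL;
		goto out_free;
	}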
Signed-off-by: Jann Horn Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: Ib050ef9dc10e623589d22e3a9e6aee9ee4f0cd5d Bug: 29444228 --- fs/proc/root.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/proc/root.c b/fs/proc/root.c index 361ab4ee42fc..ec649c92d270 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -121,6 +121,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); From ed23e1145053fb43639a178e89e99bb14f11272b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:06 +0200 Subject: [PATCH 0175/3220] UPSTREAM: ecryptfs: forbid opening files without mmap handler (cherry picked from commit 2f36db71009304b3f0b95afacd8eba1f9f046b87) This prevents users from triggering a stack overflow through a recursive invocation of pagefault handling that involves mapping procfs files into virtual memory. Signed-off-by: Jann Horn Acked-by: Tyler Hicks Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds Change-Id: I0be77c7f8bd3046bc34cd87ef577529792d479bc Bug: 29444228 --- fs/ecryptfs/kthread.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 866bb18efefe..e818f5ac7a26 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" struct ecryptfs_open_req { @@ -147,7 +148,7 @@ int ecryptfs_privileged_open(struct file **lower_file, flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto out; + goto have_file; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -165,8 +166,16 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) + if (IS_ERR(*lower_file)) { rc = PTR_ERR(*lower_file); + goto out; + } +have_file: + if ((*lower_file)->f_op->mmap == NULL) { + fput(*lower_file); + *lower_file = NULL; + rc = -EMEDIUMTYPE; + } out: return rc; } From 85a09f2732f9d0bbb9fdec98f292f1db6c75a96a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:07 +0200 Subject: [PATCH 0176/3220] UPSTREAM: sched: panic on corrupted stack end (cherry picked from commit 29d6455178a09e1dc340380c582b13356227e8df) Until now, hitting this BUG_ON caused a recursive oops (because oops handling involves do_exit(), which calls into the scheduler, which in turn raises an oops), which caused stuff below the stack to be overwritten until a panic happened (e.g. via an oops in interrupt context, caused by the overwritten CPU index in the thread_info). Just panic directly. 
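For reference, the condition being promoted from BUG_ON() to panic() is the end-of-stack canary test (as defined in include/linux/sched.h under CONFIG_SCHED_STACK_END_CHECK):

#define task_stack_end_corrupted(task) \
	(*(end_of_stack(task)) != STACK_END_MAGIC)

end_of_stack() is the lowest word of the task's stack, which fork writes STACK_END_MAGIC into; once that word has changed, any further scheduling is already running on corrupted state, hence the direct panic.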
Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds Change-Id: Ia3acb3f747f7a58ec2d071644433b0591925969f Bug: 29444228 --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 61b0914cc7aa..b290af9c37a1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3008,7 +3008,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { From 419d43edc519b477d365b6311c48f875cec6bfdf Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Jul 2016 11:20:55 -0700 Subject: [PATCH 0177/3220] FROMLIST: proc: Relax /proc//timerslack_ns capability requirements When an interface to allow a task to change another task's timerslack was first proposed, it was suggested that something greater than CAP_SYS_NICE would be needed, as a task could be delayed further than what normally could be done with nice adjustments. So CAP_SYS_PTRACE was adopted instead for what became the /proc//timerslack_ns interface. However, for Android (where this feature originates), giving the system_server CAP_SYS_PTRACE would allow it to observe and modify all tasks' memory. This is considered too high a privilege level for only needing to change the timerslack. After some discussion, it was realized that a CAP_SYS_NICE process can set a task as SCHED_FIFO, so they could fork some spinning processes and set them all SCHED_FIFO 99, in effect delaying all other tasks for an infinite amount of time. So as a CAP_SYS_NICE task can already cause trouble for other tasks, using it as a required capability for accessing and modifying /proc//timerslack_ns seems sufficient. Thus, this patch loosens the capability requirements to CAP_SYS_NICE and removes CAP_SYS_PTRACE, simplifying some of the code flow as well. This is technically an ABI change, but as the feature just landed in 4.6, I suspect no one is yet using it. Cc: Kees Cook Cc: "Serge E.
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: Android Kernel Team Reviewed-by: Nick Kralevich Acked-by: Serge Hallyn Acked-by: Kees Cook Signed-off-by: John Stultz FROMLIST URL: https://lkml.org/lkml/2016/7/21/522 Change-Id: Ia75481402e3948165a1b7c1551c539530cb25509 (Cherry picked against common/android-4.4) Signed-off-by: John Stultz --- fs/proc/base.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index db221e53a1ac..ee1b021e153e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2260,16 +2260,19 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { count = -EPERM; + goto out; + } + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + +out: put_task_struct(p); return count; @@ -2279,19 +2282,22 @@ static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; - int err = 0; + int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - seq_printf(m, "%llu\n", p->timer_slack_ns); - task_unlock(p); - } else + if (!capable(CAP_SYS_NICE)) { err = -EPERM; + goto out; + } + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + +out: put_task_struct(p); return err; From 5d1b9c0c012ff96d543aa90db888ca511f53dfaa Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 Jul 2016 17:22:19 -0700 Subject: [PATCH 0178/3220] FROMLIST: proc: Add LSM hook checks to /proc//timerslack_ns As requested, this patch checks the existing LSM hooks task_getscheduler/task_setscheduler when reading or modifying the task's timerslack value. Previous versions added new get/settimerslack LSM hooks, but since they checked the same PROCESS__SET/GETSCHED values as existing hooks, it was suggested we just use the existing ones. Cc: Kees Cook Cc: "Serge E. 
Hallyn" Cc: Andrew Morton Cc: Thomas Gleixner CC: Arjan van de Ven Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Todd Kjos Cc: Colin Cross Cc: Nick Kralevich Cc: Dmitry Shmidt Cc: Elliott Hughes Cc: James Morris Cc: Android Kernel Team Cc: linux-security-module@vger.kernel.org Cc: selinux@tycho.nsa.gov Signed-off-by: John Stultz FROMLIST URL: https://lkml.org/lkml/2016/7/21/523 Change-Id: Id157d10e2fe0b85f1be45035a6117358a42af028 (Cherry picked back to common/android-4.4) Signed-off-by: John Stultz --- fs/proc/base.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index ee1b021e153e..ac5d2aea63b5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2265,6 +2265,12 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, goto out; } + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } + task_lock(p); if (slack_ns == 0) p->timer_slack_ns = p->default_timer_slack_ns; @@ -2293,6 +2299,10 @@ static int timerslack_ns_show(struct seq_file *m, void *v) goto out; } + err = security_task_getscheduler(p); + if (err) + goto out; + task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); From 850117629453026394e13622203e2276891c3b8a Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 15 Jul 2016 12:39:13 +0200 Subject: [PATCH 0179/3220] BACKPORT: brcmfmac: defer DPC processing during probe The sdio dpc starts processing when in SDIOD_STATE_DATA. This state was entered right after firmware download. This patch moves that transition just before enabling sdio interrupt handling thus avoiding watchdog expiry which would put the bus to sleep while probing. Change-Id: I09c60752374b8145da78000935062be08c5c8a52 Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Signed-off-by: Dmitry Shmidt --- drivers/net/wireless/brcm80211/brcmfmac/sdio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c index 7e74ac3ad815..bcf29bf6f727 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c @@ -3401,10 +3401,6 @@ static int brcmf_sdio_download_firmware(struct brcmf_sdio *bus, goto err; } - /* Allow full data communication using DPC from now on. */ - brcmf_sdiod_change_state(bus->sdiodev, BRCMF_SDIOD_DATA); - bcmerror = 0; - err: brcmf_sdio_clkctl(bus, CLK_SDONLY, false); sdio_release_host(bus->sdiodev->func[1]); @@ -4112,6 +4108,9 @@ static void brcmf_sdio_firmware_callback(struct device *dev, } if (err == 0) { + /* Allow full data communication using DPC from now on. */ + brcmf_sdiod_change_state(bus->sdiodev, BRCMF_SDIOD_DATA); + err = brcmf_sdiod_intr_register(sdiodev); if (err != 0) brcmf_err("intr register failed:%d\n", err); From 1b06e2539281187910eb971202c7b2786fab5b57 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Sun, 31 Jul 2016 22:30:14 -0400 Subject: [PATCH 0180/3220] usb: gadget: f_accessory: remove duplicate endpoint alloc usb_ep_autoconfig is called twice for allocating bulk out endpoint. Removed the unwanted call. 
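After the removal, create_bulk_endpoints() follows the usual one-call-per-endpoint pattern (condensed from the function as patched): each usb_ep_autoconfig() call claims one endpoint from the gadget controller's limited pool, so the duplicated out-descriptor call was claiming a second endpoint that was never used or released.

	ep = usb_ep_autoconfig(cdev->gadget, in_desc);
	if (!ep) {
		DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
		return -ENODEV;
	}
	ep->driver_data = dev;	/* claim the endpoint */
	dev->ep_in = ep;

	ep = usb_ep_autoconfig(cdev->gadget, out_desc);
	if (!ep) {
		DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
		return -ENODEV;
	}
	ep->driver_data = dev;	/* claim the endpoint */
	dev->ep_out = ep;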
Fixes Issue: 67180 Change-Id: I03e87a86fbbbc85831ff7f0496adf038d1de2956 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index c62123560143..2ca16a577542 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -531,15 +531,6 @@ static int create_bulk_endpoints(struct acc_dev *dev, ep->driver_data = dev; /* claim the endpoint */ dev->ep_out = ep; - ep = usb_ep_autoconfig(cdev->gadget, out_desc); - if (!ep) { - DBG(cdev, "usb_ep_autoconfig for ep_out failed\n"); - return -ENODEV; - } - DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name); - ep->driver_data = dev; /* claim the endpoint */ - dev->ep_out = ep; - /* now allocate requests for our endpoints */ for (i = 0; i < TX_REQ_MAX; i++) { req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE); From 818aa36ea868ba8f2985f9ca0906fd9cba3e437d Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Sun, 31 Jul 2016 17:07:46 +0530 Subject: [PATCH 0181/3220] Revert "panic: Add board ID to panic output" This reverts commit 4e09c510185cb4db2277ce81cce81b7aa06bea45. I checked for the usage of this debug helper in AOSP common kernels as well as vendor kernels (e.g. exynos, msm, mediatek, omap, tegra, x86, x86_64) hosted at https://android.googlesource.com/kernel/ and I found out that other than a few fairly obsolete Omap trees (for tuna & Glass) and the Exynos tree (for Manta), there is no active user of this debug helper. So we can safely remove this helper code. Signed-off-by: Amit Pundir --- include/linux/kernel.h | 4 ---- kernel/panic.c | 8 -------- 2 files changed, 12 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 1eaf3d81f5c1..350dfb08aee3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -830,8 +830,4 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } /* OTHER_WRITABLE? Generally considered a bad idea. */ \ BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) - -/* To identify board information in panic logs, set this */ -extern char *mach_panic_string; - #endif diff --git a/kernel/panic.c b/kernel/panic.c index f07bfc9fe613..4b150bc0c6c1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -28,9 +28,6 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -/* Machine specific panic information string */ -char *mach_panic_string; - int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; @@ -416,11 +413,6 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - - if (mach_panic_string) - printk(KERN_WARNING "Board Information: %s\n", - mach_panic_string); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } From f3d9c312b89966b6d1ff9e4c503a7ae6d38e0176 Mon Sep 17 00:00:00 2001 From: James Carr Date: Fri, 29 Jul 2016 19:02:16 -0700 Subject: [PATCH 0182/3220] Implement memory_state_time, used by qcom,cpubw New driver memory_state_time tracks time spent in different DDR frequency and bandwidth states. Memory drivers such as qcom,cpubw can post updated state to the driver after registering a callback; updates are processed by a workqueue. Bandwidth buckets are read in from the device tree in the relevant qualcomm section and can be defined in any quantity and spacing. The data is exposed at /sys/kernel/memory_state_time, able to be read by the Android framework.
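To make the callback flow concrete, here is a minimal sketch of a memory driver registering as a bandwidth source and posting updates; it relies only on the API this patch adds in include/linux/memory-state-time.h, and the example_* names are illustrative, not part of the patch:

    #include <linux/memory-state-time.h>

    static struct memory_state_update_block *mst_block;

    static int example_bus_probe(void)
    {
            /* one-time registration; returns NULL on failure */
            mst_block = memory_state_register_bandwidth_source();
            return mst_block ? 0 : -ENOMEM;
    }

    static void example_bus_bw_changed(int mbps)
    {
            /* posts the new bandwidth; memory_state_time charges the
             * elapsed interval to the previous state from its workqueue */
            if (mst_block)
                    UPDATE_MEMORY_STATE(mst_block, mbps);
    }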
Functionality is behind a config option CONFIG_MEMORY_STATE_TIME Change-Id: I4fee165571cb975fb9eacbc9aada5e6d7dd748f0 Signed-off-by: James Carr --- .../bindings/misc/memory-state-time.txt | 8 + android/configs/android-recommended.cfg | 1 + drivers/misc/Kconfig | 6 + drivers/misc/Makefile | 1 + drivers/misc/memory_state_time.c | 454 ++++++++++++++++++ include/linux/memory-state-time.h | 42 ++ 6 files changed, 512 insertions(+) create mode 100644 Documentation/devicetree/bindings/misc/memory-state-time.txt create mode 100644 drivers/misc/memory_state_time.c create mode 100644 include/linux/memory-state-time.h diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt new file mode 100644 index 000000000000..c99a506c030d --- /dev/null +++ b/Documentation/devicetree/bindings/misc/memory-state-time.txt @@ -0,0 +1,8 @@ +Memory bandwidth and frequency state tracking + +Required properties: +- compatible : should be: + "memory-state-time" +- freq-tbl: Should contain entries with each frequency in Hz. +- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps. + Must match the framework power_profile.xml for the device. diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 2f1ef077aa9e..3465a848d74d 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -119,6 +119,7 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y +CONFIG_MEMORY_STATE_TIME=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 2763d6b3b063..598f96c26678 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -531,6 +531,12 @@ config UID_CPUTIME help Per UID based cpu time statistics exported to /proc/uid_cputime +config MEMORY_STATE_TIME + tristate "Memory freq/bandwidth time statistics" + depends on PROFILING + help + Memory time statistics exported to /sys/kernel/memory_state_time + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index e5142b836aee..b76b4c9fe104 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -57,3 +57,4 @@ obj-$(CONFIG_ECHO) += echo/ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o obj-$(CONFIG_CXL_BASE) += cxl/ obj-$(CONFIG_UID_CPUTIME) += uid_cputime.o +obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c new file mode 100644 index 000000000000..34c797a06a31 --- /dev/null +++ b/drivers/misc/memory_state_time.c @@ -0,0 +1,454 @@ +/* drivers/misc/memory_state_time.c + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KERNEL_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define KERNEL_ATTR_RW(_name) \ +static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +#define FREQ_HASH_BITS 4 +DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS); + +static DEFINE_MUTEX(mem_lock); + +#define TAG "memory_state_time" +#define BW_NODE "/soc/memory-state-time" +#define FREQ_TBL "freq-tbl" +#define BW_TBL "bw-buckets" +#define NUM_SOURCES "num-sources" + +#define LOWEST_FREQ 2 + +static int curr_bw; +static int curr_freq; +static u32 *bw_buckets; +static u32 *freq_buckets; +static int num_freqs; +static int num_buckets; +static int registered_bw_sources; +static u64 last_update; +static bool init_success; +static struct workqueue_struct *memory_wq; +static u32 num_sources = 10; +static int *bandwidths; + +struct freq_entry { + int freq; + u64 *buckets; /* Bandwidth buckets. */ + struct hlist_node hash; +}; + +struct queue_container { + struct work_struct update_state; + int value; + u64 time_now; + int id; + struct mutex *lock; +}; + +static int find_bucket(int bw) +{ + int i; + + if (bw_buckets != NULL) { + for (i = 0; i < num_buckets; i++) { + if (bw_buckets[i] > bw) { + pr_debug("Found bucket %d for bandwidth %d\n", + i, bw); + return i; + } + } + return num_buckets - 1; + } + return 0; +} + +static u64 get_time_diff(u64 time_now) +{ + u64 ms; + + ms = time_now - last_update; + last_update = time_now; + return ms; +} + +static ssize_t show_stat_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, j; + int len = 0; + struct freq_entry *freq_entry; + + for (i = 0; i < num_freqs; i++) { + hash_for_each_possible(freq_hash_table, freq_entry, hash, + freq_buckets[i]) { + if (freq_entry->freq == freq_buckets[i]) { + len += scnprintf(buf + len, PAGE_SIZE - len, + "%d ", freq_buckets[i]); + if (len >= PAGE_SIZE) + break; + for (j = 0; j < num_buckets; j++) { + len += scnprintf(buf + len, + PAGE_SIZE - len, + "%llu ", + freq_entry->buckets[j]); + } + len += scnprintf(buf + len, PAGE_SIZE - len, + "\n"); + } + } + } + pr_debug("Current Time: %llu\n", ktime_get_boot_ns()); + return len; +} +KERNEL_ATTR_RO(show_stat); + +static void update_table(u64 time_now) +{ + struct freq_entry *freq_entry; + + pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq); + hash_for_each_possible(freq_hash_table, freq_entry, hash, curr_freq) { + if (curr_freq == freq_entry->freq) { + freq_entry->buckets[find_bucket(curr_bw)] + += get_time_diff(time_now); + break; + } + } +} + +static bool freq_exists(int freq) +{ + int i; + + for (i = 0; i < num_freqs; i++) { + if (freq == freq_buckets[i]) + return true; + } + return false; +} + +static int calculate_total_bw(int bw, int index) +{ + int i; + int total_bw = 0; + + pr_debug("memory_state_time New bw %d for id %d\n", bw, index); + bandwidths[index] = bw; + for (i = 0; i < registered_bw_sources; i++) + total_bw += bandwidths[i]; + return total_bw; +} + +static void freq_update_do_work(struct work_struct *work) +{ + struct queue_container *freq_state_update + = container_of(work, struct queue_container, + update_state); + if (freq_state_update) { + mutex_lock(&mem_lock); + update_table(freq_state_update->time_now); + curr_freq = freq_state_update->value; + mutex_unlock(&mem_lock); + kfree(freq_state_update); + } +} + 
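+/* Both update handlers run on the single-threaded memory_wq: each one
+ * first charges the interval since last_update to the outgoing
+ * (curr_freq, curr_bw) state via update_table(), and only then records
+ * the new value, so time is accounted up to the moment of each
+ * transition.
+ */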
+static void bw_update_do_work(struct work_struct *work) +{ + struct queue_container *bw_state_update + = container_of(work, struct queue_container, + update_state); + if (bw_state_update) { + mutex_lock(&mem_lock); + update_table(bw_state_update->time_now); + curr_bw = calculate_total_bw(bw_state_update->value, + bw_state_update->id); + mutex_unlock(&mem_lock); + kfree(bw_state_update); + } +} + +static void memory_state_freq_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (freq_exists(value) && init_success) { + struct queue_container *freq_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!freq_container) + return; + INIT_WORK(&freq_container->update_state, + freq_update_do_work); + freq_container->time_now = ktime_get_boot_ns(); + freq_container->value = value; + pr_debug("Scheduling freq update in work queue\n"); + queue_work(memory_wq, &freq_container->update_state); + } else { + pr_debug("Freq does not exist.\n"); + } + } +} + +static void memory_state_bw_update(struct memory_state_update_block *ub, + int value) +{ + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + if (init_success) { + struct queue_container *bw_container + = kmalloc(sizeof(struct queue_container), + GFP_KERNEL); + if (!bw_container) + return; + INIT_WORK(&bw_container->update_state, + bw_update_do_work); + bw_container->time_now = ktime_get_boot_ns(); + bw_container->value = value; + bw_container->id = ub->id; + pr_debug("Scheduling bandwidth update in work queue\n"); + queue_work(memory_wq, &bw_container->update_state); + } + } +} + +struct memory_state_update_block *memory_state_register_frequency_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating frequency source\n"); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_freq_update; + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_frequency_source); + +struct memory_state_update_block *memory_state_register_bandwidth_source(void) +{ + struct memory_state_update_block *block; + + if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) { + pr_debug("Allocating bandwidth source %d\n", + registered_bw_sources); + block = kmalloc(sizeof(struct memory_state_update_block), + GFP_KERNEL); + if (!block) + return NULL; + block->update_call = memory_state_bw_update; + if (registered_bw_sources < num_sources) { + block->id = registered_bw_sources++; + } else { + pr_err("Unable to allocate source; max number reached\n"); + kfree(block); + return NULL; + } + return block; + } + pr_err("Config option disabled.\n"); + return NULL; +} +EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source); + +/* Buckets are designated by their maximum. + * Returns the buckets decided by the capability of the device. 
+ */ +static int get_bw_buckets(struct device *dev) +{ + int ret, lenb; + struct device_node *node = dev->of_node; + + of_property_read_u32(node, NUM_SOURCES, &num_sources); + if (of_find_property(node, BW_TBL, &lenb)) { + bandwidths = devm_kzalloc(dev, + sizeof(*bandwidths) * num_sources, GFP_KERNEL); + if (!bandwidths) + return -ENOMEM; + lenb /= sizeof(*bw_buckets); + bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), + GFP_KERNEL); + if (!bw_buckets) { + devm_kfree(dev, bandwidths); + return -ENOMEM; + } + ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, + lenb); + if (ret < 0) { + devm_kfree(dev, bandwidths); + devm_kfree(dev, bw_buckets); + pr_err("Unable to read bandwidth table from device tree.\n"); + return ret; + } + } + curr_bw = 0; + num_buckets = lenb; + return 0; +} + +/* Adds struct freq_entry nodes to the hashtable for each compatible frequency. + * Returns the supported number of frequencies. + */ +static int freq_buckets_init(struct device *dev) +{ + struct freq_entry *freq_entry; + int i; + int ret, lenf; + struct device_node *node = dev->of_node; + + if (of_find_property(node, FREQ_TBL, &lenf)) { + lenf /= sizeof(*freq_buckets); + freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), + GFP_KERNEL); + if (!freq_buckets) + return -ENOMEM; + pr_debug("freqs found len %d\n", lenf); + ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, + lenf); + if (ret < 0) { + devm_kfree(dev, freq_buckets); + pr_err("Unable to read frequency table from device tree.\n"); + return ret; + } + pr_debug("ret freq %d\n", ret); + } + num_freqs = lenf; + curr_freq = freq_buckets[LOWEST_FREQ]; + + for (i = 0; i < num_freqs; i++) { + freq_entry = devm_kzalloc(dev, sizeof(struct freq_entry), + GFP_KERNEL); + if (!freq_entry) + return -ENOMEM; + freq_entry->buckets = devm_kzalloc(dev, sizeof(u64)*num_buckets, + GFP_KERNEL); + if (!freq_entry->buckets) { + devm_kfree(dev, freq_entry); + return -ENOMEM; + } + pr_debug("memory_state_time Adding freq to ht %d\n", + freq_buckets[i]); + freq_entry->freq = freq_buckets[i]; + hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]); + } + return 0; +} + +struct kobject *memory_kobj; +EXPORT_SYMBOL_GPL(memory_kobj); + +static struct attribute *memory_attrs[] = { + &show_stat_attr.attr, + NULL +}; + +static struct attribute_group memory_attr_group = { + .attrs = memory_attrs, +}; + +static int memory_state_time_probe(struct platform_device *pdev) +{ + int error; + + error = get_bw_buckets(&pdev->dev); + if (error) + return error; + error = freq_buckets_init(&pdev->dev); + if (error) + return error; + last_update = ktime_get_boot_ns(); + init_success = true; + + pr_debug("memory_state_time initialized with num_freqs %d\n", + num_freqs); + return 0; +} + +static const struct of_device_id match_table[] = { + { .compatible = "memory-state-time" }, + {} +}; + +static struct platform_driver memory_state_time_driver = { + .probe = memory_state_time_probe, + .driver = { + .name = "memory-state-time", + .of_match_table = match_table, + .owner = THIS_MODULE, + }, +}; + +static int __init memory_state_time_init(void) +{ + int error; + + hash_init(freq_hash_table); + memory_wq = create_singlethread_workqueue("memory_wq"); + if (!memory_wq) { + pr_err("Unable to create workqueue.\n"); + return -EINVAL; + } + /* + * Create sys/kernel directory for memory_state_time. 
+ */ + memory_kobj = kobject_create_and_add(TAG, kernel_kobj); + if (!memory_kobj) { + pr_err("Unable to allocate memory_kobj for sysfs directory.\n"); + error = -ENOMEM; + goto wq; + } + error = sysfs_create_group(memory_kobj, &memory_attr_group); + if (error) { + pr_err("Unable to create sysfs folder.\n"); + goto kobj; + } + + error = platform_driver_register(&memory_state_time_driver); + if (error) { + pr_err("Unable to register memory_state_time platform driver.\n"); + goto group; + } + return 0; + +group: sysfs_remove_group(memory_kobj, &memory_attr_group); +kobj: kobject_put(memory_kobj); +wq: destroy_workqueue(memory_wq); + return error; +} +module_init(memory_state_time_init); diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h new file mode 100644 index 000000000000..d2212b027866 --- /dev/null +++ b/include/linux/memory-state-time.h @@ -0,0 +1,42 @@ +/* include/linux/memory-state-time.h + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include + +#define UPDATE_MEMORY_STATE(BLOCK, VALUE) BLOCK->update_call(BLOCK, VALUE) + +struct memory_state_update_block; + +typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub, + int value); + +/* This struct is populated when you pass it to a memory_state_register* + * function. The update_call function is used for an update and defined in the + * typedef memory_state_update_fn_t + */ +struct memory_state_update_block { + memory_state_update_fn_t update_call; + int id; +}; + +/* Register a frequency struct memory_state_update_block to provide updates to + * memory_state_time about frequency changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_frequency_source(void); + +/* Register a bandwidth struct memory_state_update_block to provide updates to + * memory_state_time about bandwidth changes using its update_call function. + */ +struct memory_state_update_block *memory_state_register_bandwidth_source(void); From ee247d4205fdee14675c4068c7fbb2c8118ba4ce Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 14 Mar 2016 09:56:35 -0300 Subject: [PATCH 0183/3220] UPSTREAM: net: Fix use after free in the recvmmsg exit path (cherry picked from commit 34b88a68f26a75e4fded796f1a49c40f82234b7d) The syzkaller fuzzer hit the following use-after-free: Call Trace: [] __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:295 [] __sys_recvmmsg+0x6fa/0x7f0 net/socket.c:2261 [< inline >] SYSC_recvmmsg net/socket.c:2281 [] SyS_recvmmsg+0x16f/0x180 net/socket.c:2270 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 And, as Dmitry rightly assessed, that is because we can drop the reference and then touch it when the underlying recvmsg calls return some packets and then hit an error, which will make recvmmsg set sock->sk->sk_err, oops, fix it.
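As a userspace illustration (not part of the patch), the deferred error becomes visible on a later call or through SO_ERROR; a minimal sketch, assuming fd is a datagram socket and msgs is a prepared mmsghdr array:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/socket.h>

    static void drain(int fd, struct mmsghdr *msgs, unsigned int vlen)
    {
            /* returns the number of datagrams received, not an error,
             * whenever at least one datagram was delivered */
            int n = recvmmsg(fd, msgs, vlen, 0, NULL);

            if (n >= 0 && (unsigned int)n < vlen) {
                    int err = 0;
                    socklen_t len = sizeof(err);

                    /* any error deferred into sk->sk_err is reported here */
                    if (!getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) && err)
                            fprintf(stderr, "deferred recvmmsg error: %d\n", err);
            }
    }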
Reported-and-Tested-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Eric Dumazet Cc: Kostya Serebryany Cc: Sasha Levin Fixes: a2e2725541fa ("net: Introduce recvmmsg socket syscall") http://lkml.kernel.org/r/20160122211644.GC2470@redhat.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Change-Id: I2adb0faf595b7b634d9b739dfdd1a47109e20ecb Bug: 30515201 --- net/socket.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/net/socket.c b/net/socket.c index d730ef9dfbf0..263b334ec5e4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2238,31 +2238,31 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, break; } + if (err == 0) + goto out_put; + + if (datagrams == 0) { + datagrams = err; + goto out_put; + } + + /* + * We may return less entries than requested (vlen) if the + * sock is non block and there aren't enough datagrams... + */ + if (err != -EAGAIN) { + /* + * ... or if recvmsg returns an error after we + * received some datagrams, where we record the + * error to return on the next call or if the + * app asks about it using getsockopt(SO_ERROR). + */ + sock->sk->sk_err = -err; + } out_put: fput_light(sock->file, fput_needed); - if (err == 0) - return datagrams; - - if (datagrams != 0) { - /* - * We may return less entries than requested (vlen) if the - * sock is non block and there aren't enough datagrams... - */ - if (err != -EAGAIN) { - /* - * ... or if recvmsg returns an error after we - * received some datagrams, where we record the - * error to return on the next call or if the - * app asks about it using getsockopt(SO_ERROR). - */ - sock->sk->sk_err = -err; - } - - return datagrams; - } - - return err; + return datagrams; } SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, From 96b0434c25be8c73822a679be35b29ca5263baea Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Wed, 9 Jun 2010 17:47:38 -0500 Subject: [PATCH 0184/3220] CHROMIUM: dm: boot time specification of dm= This is a wrap-up of three patches pending upstream approval. I'm bundling them because they are interdependent, and it'll be easier to drop them on rebase later. 1. dm: allow a dm-fs-style device to be shared via dm-ioctl Integrates feedback from Alisdair, Mike, and Kiyoshi. Two main changes occur here: - One function is added which allows for a programmatically created mapped device to be inserted into the dm-ioctl hash table. This binds the device to a name and, optionally, a uuid which is needed by udev and allows for userspace management of the mapped device. - dm_table_complete() was extended to handle all of the final functional changes required for the table to be operational once called. 2. init: boot to device-mapper targets without an initr* Add a dm= kernel parameter modeled after the md= parameter from do_mounts_md. It allows for device-mapper targets to be configured at boot time for use early in the boot process (as the root device or otherwise). It also replaces /dev/XXX calls with major:minor opportunistically. The format is dm="name uuid ro,table line 1,table line 2,...". The parser expects the comma to be safe to use as a newline substitute but, otherwise, uses the normal separator of space. Some attempt has been made to make it forgiving of additional spaces (using skip_spaces()). A mapped device created during boot will be assigned a minor of 0 and may be accessed via /dev/dm-0.
An example dm-linear root with no uuid may look like: root=/dev/dm-0 dm="lroot none ro, 0 4096 linear /dev/ubdb 0, 4096 4096 linear /dev/ubdc 0" Once udev is started, /dev/dm-0 will become /dev/mapper/lroot. Older upstream threads: http://marc.info/?l=dm-devel&m=127429492521964&w=2 http://marc.info/?l=dm-devel&m=127429499422096&w=2 http://marc.info/?l=dm-devel&m=127429493922000&w=2 Latest upstream threads: https://patchwork.kernel.org/patch/104859/ https://patchwork.kernel.org/patch/104860/ https://patchwork.kernel.org/patch/104861/ Bug: 27175947 Signed-off-by: Will Drewry Review URL: http://codereview.chromium.org/2020011 Change-Id: I92bd53432a11241228d2e5ac89a3b20d19b05a31 --- Documentation/device-mapper/boot.txt | 42 +++ Documentation/kernel-parameters.txt | 6 + drivers/md/dm-ioctl.c | 39 +++ drivers/md/dm-table.c | 1 + include/linux/device-mapper.h | 6 + init/Makefile | 1 + init/do_mounts.c | 1 + init/do_mounts.h | 10 + init/do_mounts_dm.c | 410 +++++++++++++++++++++++++++ 9 files changed, 516 insertions(+) create mode 100644 Documentation/device-mapper/boot.txt create mode 100644 init/do_mounts_dm.c diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt new file mode 100644 index 000000000000..adcaad5e5e32 --- /dev/null +++ b/Documentation/device-mapper/boot.txt @@ -0,0 +1,42 @@ +Boot time creation of mapped devices +=================================== + +It is possible to configure a device mapper device to act as the root +device for your system in two ways. + +The first is to build an initial ramdisk which boots to a minimal +userspace which configures the device, then pivot_root(8) into it. + +For simple device mapper configurations, it is possible to boot directly +using the following kernel command line: + +dm="<name> <uuid> <ro>,table line 1,...,table line n" + +name = the name to associate with the device + after boot, udev, if used, will use that name to label + the device node. +uuid = may be 'none' or the UUID desired for the device. +ro = may be "ro" or "rw". If "ro", the device and device table will be + marked read-only. + +Each table line may be as normal when using the dmsetup tool except for +two variations: +1. Any use of commas will be interpreted as a newline +2. Quotation marks cannot be escaped and cannot be used without + terminating the dm= argument. + +Unless renamed by udev, the device node created will be dm-0 as the +first minor number for the device-mapper is used during early creation. + +Example +======= + +- Booting to a linear array made up of user-mode linux block devices: + + dm="lroot none 0, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \ + root=/dev/dm-0 + +Will boot to a rw dm-linear target of 8192 sectors split across two +block devices identified by their major:minor numbers. After boot, udev +will rename this target to /dev/mapper/lroot (depending on the rules). +No uuid was assigned. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d18fc8..b02f027cba72 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -56,6 +56,7 @@ parameter is applicable: BLACKFIN Blackfin architecture is enabled. CLK Common clock infrastructure is enabled. CMA Contiguous Memory Area support is enabled. + DM Device mapper support is enabled. DRM Direct Rendering Management support is enabled. DYNAMIC_DEBUG Build in debug messages and enable them at runtime EDD BIOS Enhanced Disk Drive Services (EDD) is enabled @@ -915,6 +916,11 @@ bytes respectively.
Such letter suffixes can also be entirely omitted. dis_ucode_ldr [X86] Disable the microcode loader. + dm= [DM] Allows early creation of a device-mapper device. + See Documentation/device-mapper/boot.txt. + + dmasound= [HW,OSS] Sound subsystem buff + dma_debug=off If the kernel is compiled with DMA_API_DEBUG support, this option disables the debugging code at boot. diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 80a439543259..bc5e9a5b1f30 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1923,6 +1923,45 @@ void dm_interface_exit(void) dm_hash_exit(); } + +/** + * dm_ioctl_export - Permanently export a mapped device via the ioctl interface + * @md: Pointer to mapped_device + * @name: Buffer (size DM_NAME_LEN) for name + * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired + */ +int dm_ioctl_export(struct mapped_device *md, const char *name, + const char *uuid) +{ + int r = 0; + struct hash_cell *hc; + + if (!md) { + r = -ENXIO; + goto out; + } + + /* The name and uuid can only be set once. */ + mutex_lock(&dm_hash_cells_mutex); + hc = dm_get_mdptr(md); + mutex_unlock(&dm_hash_cells_mutex); + if (hc) { + DMERR("%s: already exported", dm_device_name(md)); + r = -ENXIO; + goto out; + } + + r = dm_hash_insert(name, uuid, md); + if (r) { + DMERR("%s: could not bind to '%s'", dm_device_name(md), name); + goto out; + } + + /* Let udev know we've changed. */ + dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md)); +out: + return r; +} /** * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers * @md: Pointer to mapped_device diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 061152a43730..80ede7a70761 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ec1c61c87d89..87afa0552398 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -380,6 +380,12 @@ void dm_put(struct mapped_device *md); void dm_set_mdptr(struct mapped_device *md, void *ptr); void *dm_get_mdptr(struct mapped_device *md); +/* + * Export the device via the ioctl interface (uses mdptr). + */ +int dm_ioctl_export(struct mapped_device *md, const char *name, + const char *uuid); + /* * A device can still be used while suspended, but I/O is deferred. 
*/ diff --git a/init/Makefile b/init/Makefile index 692b91f1c1d4..243f61de2cba 100644 --- a/init/Makefile +++ b/init/Makefile @@ -15,6 +15,7 @@ mounts-y := do_mounts.o mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o +mounts-$(CONFIG_BLK_DEV_DM) += do_mounts_dm.o # dependencies on generated files need to be listed explicitly $(obj)/version.o: include/generated/compile.h diff --git a/init/do_mounts.c b/init/do_mounts.c index dea5de95c2dd..1902a1c80831 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -566,6 +566,7 @@ void __init prepare_namespace(void) wait_for_device_probe(); md_run_setup(); + dm_run_setup(); if (saved_root_name[0]) { root_device_name = saved_root_name; diff --git a/init/do_mounts.h b/init/do_mounts.h index f5b978a9bb92..09d22862e8c3 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -74,3 +74,13 @@ void md_run_setup(void); static inline void md_run_setup(void) {} #endif + +#ifdef CONFIG_BLK_DEV_DM + +void dm_run_setup(void); + +#else + +static inline void dm_run_setup(void) {} + +#endif diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c new file mode 100644 index 000000000000..0fd3411533f3 --- /dev/null +++ b/init/do_mounts_dm.c @@ -0,0 +1,410 @@ +/* do_mounts_dm.c + * Copyright (C) 2010 The Chromium OS Authors + * All Rights Reserved. + * Based on do_mounts_md.c + * + * This file is released under the GPL. + */ +#include +#include +#include + +#include "do_mounts.h" + +#define DM_MAX_NAME 32 +#define DM_MAX_UUID 129 +#define DM_NO_UUID "none" + +#define DM_MSG_PREFIX "init" + +/* Separators used for parsing the dm= argument. */ +#define DM_FIELD_SEP ' ' +#define DM_LINE_SEP ',' + +/* + * When the device-mapper and any targets are compiled into the kernel + * (not a module), one target may be created and used as the root device at + * boot time with the parameters given with the boot line dm=... + * The code for that is here. + */ + +struct dm_setup_target { + sector_t begin; + sector_t length; + char *type; + char *params; + /* simple singly linked list */ + struct dm_setup_target *next; +}; + +static struct { + int minor; + int ro; + char name[DM_MAX_NAME]; + char uuid[DM_MAX_UUID]; + char *targets; + struct dm_setup_target *target; + int target_count; +} dm_setup_args __initdata; + +static __initdata int dm_early_setup; + +static size_t __init get_dm_option(char *str, char **next, char sep) +{ + size_t len = 0; + char *endp = NULL; + + if (!str) + return 0; + + endp = strchr(str, sep); + if (!endp) { /* act like strchrnul */ + len = strlen(str); + endp = str + len; + } else { + len = endp - str; + } + + if (endp == str) + return 0; + + if (!next) + return len; + + if (*endp == 0) { + /* Don't advance past the nul. 
*/ + *next = endp; + } else { + *next = endp + 1; + } + return len; +} + +static int __init dm_setup_args_init(void) +{ + dm_setup_args.minor = 0; + dm_setup_args.ro = 0; + dm_setup_args.target = NULL; + dm_setup_args.target_count = 0; + return 0; +} + +static int __init dm_setup_cleanup(void) +{ + struct dm_setup_target *target = dm_setup_args.target; + struct dm_setup_target *old_target = NULL; + while (target) { + kfree(target->type); + kfree(target->params); + old_target = target; + target = target->next; + kfree(old_target); + dm_setup_args.target_count--; + } + BUG_ON(dm_setup_args.target_count); + return 0; +} + +static char * __init dm_setup_parse_device_args(char *str) +{ + char *next = NULL; + size_t len = 0; + + /* Grab the logical name of the device to be exported to udev */ + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len) { + DMERR("failed to parse device name"); + goto parse_fail; + } + len = min(len + 1, sizeof(dm_setup_args.name)); + strlcpy(dm_setup_args.name, str, len); /* includes nul */ + str = skip_spaces(next); + + /* Grab the UUID value or "none" */ + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len) { + DMERR("failed to parse device uuid"); + goto parse_fail; + } + len = min(len + 1, sizeof(dm_setup_args.uuid)); + strlcpy(dm_setup_args.uuid, str, len); + str = skip_spaces(next); + + /* Determine if the table/device will be read only or read-write */ + if (!strncmp("ro,", str, 3)) { + dm_setup_args.ro = 1; + } else if (!strncmp("rw,", str, 3)) { + dm_setup_args.ro = 0; + } else { + DMERR("failed to parse table mode"); + goto parse_fail; + } + str = skip_spaces(str + 3); + + return str; + +parse_fail: + return NULL; +} + +static void __init dm_substitute_devices(char *str, size_t str_len) +{ + char *candidate = str; + char *candidate_end = str; + char old_char; + size_t len = 0; + dev_t dev; + + if (str_len < 3) + return; + + while (str && *str) { + candidate = strchr(str, '/'); + if (!candidate) + break; + + /* Avoid embedded slashes */ + if (candidate != str && *(candidate - 1) != DM_FIELD_SEP) { + str = strchr(candidate, DM_FIELD_SEP); + continue; + } + + len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP); + str = skip_spaces(candidate_end); + if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */ + continue; + + /* Temporarily terminate with a nul */ + candidate_end--; + old_char = *candidate_end; + *candidate_end = '\0'; + + DMDEBUG("converting candidate device '%s' to dev_t", candidate); + /* Use the boot-time specific device naming */ + dev = name_to_dev_t(candidate); + *candidate_end = old_char; + + DMDEBUG(" -> %u", dev); + /* No suitable replacement found */ + if (!dev) + continue; + + /* Rewrite the /dev/path as a major:minor */ + len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev)); + if (!len) { + DMERR("error substituting device major/minor."); + break; + } + candidate += len; + /* Pad out with spaces (fixing our nul) */ + while (candidate < candidate_end) + *(candidate++) = DM_FIELD_SEP; + } +} + +static int __init dm_setup_parse_targets(char *str) +{ + char *next = NULL; + size_t len = 0; + struct dm_setup_target **target = NULL; + + /* Targets are defined as per the table format but with a + * comma as a newline separator. 
*/ + target = &dm_setup_args.target; + while (str && *str) { + *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL); + if (!*target) { + DMERR("failed to allocate memory for target %d", + dm_setup_args.target_count); + goto parse_fail; + } + dm_setup_args.target_count++; + + (*target)->begin = simple_strtoull(str, &next, 10); + if (!next || *next != DM_FIELD_SEP) { + DMERR("failed to parse starting sector for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next + 1); + + (*target)->length = simple_strtoull(str, &next, 10); + if (!next || *next != DM_FIELD_SEP) { + DMERR("failed to parse length for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next + 1); + + len = get_dm_option(str, &next, DM_FIELD_SEP); + if (!len || + !((*target)->type = kstrndup(str, len, GFP_KERNEL))) { + DMERR("failed to parse type for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next); + + len = get_dm_option(str, &next, DM_LINE_SEP); + if (!len || + !((*target)->params = kstrndup(str, len, GFP_KERNEL))) { + DMERR("failed to parse params for target %d", + dm_setup_args.target_count - 1); + goto parse_fail; + } + str = skip_spaces(next); + + /* Before moving on, walk through the copied target and + * attempt to replace all /dev/xxx with the major:minor number. + * It may not be possible to resolve them traditionally at + * boot-time. */ + dm_substitute_devices((*target)->params, len); + + target = &((*target)->next); + } + DMDEBUG("parsed %d targets", dm_setup_args.target_count); + + return 0; + +parse_fail: + return 1; +} + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the DM device now; that is handled by + * dm_setup_drive after the low-level disk drivers have initialised. + * dm format is as follows: + * dm="name uuid fmode,[table line 1],[table line 2],..." + * May be used with root=/dev/dm-0 as it always uses the first dm minor. + */ + +static int __init dm_setup(char *str) +{ + dm_setup_args_init(); + + str = dm_setup_parse_device_args(str); + if (!str) { + DMDEBUG("str is NULL"); + goto parse_fail; + } + + /* Target parsing is delayed until we have dynamic memory */ + dm_setup_args.targets = str; + + printk(KERN_INFO "dm: will configure '%s' on dm-%d\n", + dm_setup_args.name, dm_setup_args.minor); + + dm_early_setup = 1; + return 1; + +parse_fail: + printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n"); + return 0; +} + + +static void __init dm_setup_drive(void) +{ + struct mapped_device *md = NULL; + struct dm_table *table = NULL; + struct dm_setup_target *target; + char *uuid = dm_setup_args.uuid; + fmode_t fmode = FMODE_READ; + + /* Finish parsing the targets. */ + if (dm_setup_parse_targets(dm_setup_args.targets)) + goto parse_fail; + + if (dm_create(dm_setup_args.minor, &md)) { + DMDEBUG("failed to create the device"); + goto dm_create_fail; + } + DMDEBUG("created device '%s'", dm_device_name(md)); + + /* In addition to flagging the table below, the disk must be + * set explicitly ro/rw. 
*/ + set_disk_ro(dm_disk(md), dm_setup_args.ro); + + if (!dm_setup_args.ro) + fmode |= FMODE_WRITE; + if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) { + DMDEBUG("failed to create the table"); + goto dm_table_create_fail; + } + + target = dm_setup_args.target; + while (target) { + DMINFO("adding target '%llu %llu %s %s'", + (unsigned long long) target->begin, + (unsigned long long) target->length, target->type, + target->params); + if (dm_table_add_target(table, target->type, target->begin, + target->length, target->params)) { + DMDEBUG("failed to add the target to the table"); + goto add_target_fail; + } + target = target->next; + } + + if (dm_table_complete(table)) { + DMDEBUG("failed to complete the table"); + goto table_complete_fail; + } + + /* Suspend the device so that we can bind it to the table. */ + if (dm_suspend(md, 0)) { + DMDEBUG("failed to suspend the device pre-bind"); + goto suspend_fail; + } + + /* Bind the table to the device. This is the only way to associate + * md->map with the table and set the disk capacity directly. */ + if (dm_swap_table(md, table)) { /* should return NULL. */ + DMDEBUG("failed to bind the device to the table"); + goto table_bind_fail; + } + + /* Finally, resume and the device should be ready. */ + if (dm_resume(md)) { + DMDEBUG("failed to resume the device"); + goto resume_fail; + } + + /* Export the dm device via the ioctl interface */ + if (!strcmp(DM_NO_UUID, dm_setup_args.uuid)) + uuid = NULL; + if (dm_ioctl_export(md, dm_setup_args.name, uuid)) { + DMDEBUG("failed to export device with given name and uuid"); + goto export_fail; + } + printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + + dm_setup_cleanup(); + return; + +export_fail: +resume_fail: +table_bind_fail: +suspend_fail: +table_complete_fail: +add_target_fail: + dm_table_put(table); +dm_table_create_fail: + dm_put(md); +dm_create_fail: + dm_setup_cleanup(); +parse_fail: + printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n", + dm_setup_args.minor, dm_setup_args.name); +} + +__setup("dm=", dm_setup); + +void __init dm_run_setup(void) +{ + if (!dm_early_setup) + return; + printk(KERN_INFO "dm: attempting early device configuration.\n"); + dm_setup_drive(); +} From 5a77db78398a7247ab6bb6ff9720bae19d1f9ff4 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 8 Feb 2016 16:47:41 -0800 Subject: [PATCH 0185/3220] ANDROID: dm: Rebase on top of 4.1 1. "dm: optimize use SRCU and RCU" removes the use of dm_table_put. 2. "dm: remove request-based logic from make_request_fn wrapper" necessitates calling dm_setup_md_queue or else the request_queue's make_request_fn pointer ends being unset. [ 7.711600] Internal error: Oops - bad mode: 0 [#1] PREEMPT SMP [ 7.717519] CPU: 1 PID: 1 Comm: swapper/0 Tainted: G W 4.1.15-02273-gb057d16-dirty #33 [ 7.726559] Hardware name: HiKey Development Board (DT) [ 7.731779] task: ffffffc005f8acc0 ti: ffffffc005f8c000 task.ti: ffffffc005f8c000 [ 7.739257] PC is at 0x0 [ 7.741787] LR is at generic_make_request+0x8c/0x108 .... 
[ 9.082931] Call trace: [ 9.085372] [< (null)>] (null) [ 9.090074] [] submit_bio+0x98/0x1e0 [ 9.095212] [] _submit_bh+0x120/0x1f0 [ 9.096165] cfg80211: Calling CRDA to update world regulatory domain [ 9.106781] [] __bread_gfp+0x94/0x114 [ 9.112004] [] ext4_fill_super+0x18c/0x2d64 [ 9.117750] [] mount_bdev+0x194/0x1c0 [ 9.122973] [] ext4_mount+0x14/0x1c [ 9.128021] [] mount_fs+0x3c/0x194 [ 9.132985] [] vfs_kern_mount+0x4c/0x134 [ 9.138467] [] do_mount+0x204/0xbbc [ 9.143514] [] SyS_mount+0x94/0xe8 [ 9.148479] [] mount_block_root+0x120/0x24c [ 9.154222] [] mount_root+0x110/0x12c [ 9.159443] [] prepare_namespace+0x170/0x1b8 [ 9.165273] [] kernel_init_freeable+0x23c/0x260 [ 9.171365] [] kernel_init+0x10/0x118 [ 9.176589] Code: bad PC value [ 9.179807] ---[ end trace 75e1bc52ba364d13 ]--- Bug: 27175947 Signed-off-by: Badhri Jagan Sridharan Change-Id: I952d86fd1475f0825f9be1386e3497b36127abd0 --- init/do_mounts_dm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index 0fd3411533f3..f521bc5ae248 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -10,6 +10,7 @@ #include #include "do_mounts.h" +#include "../drivers/md/dm.h" #define DM_MAX_NAME 32 #define DM_MAX_UUID 129 @@ -333,6 +334,7 @@ static void __init dm_setup_drive(void) goto dm_table_create_fail; } + dm_lock_md_type(md); target = dm_setup_args.target; while (target) { DMINFO("adding target '%llu %llu %s %s'", @@ -352,6 +354,17 @@ static void __init dm_setup_drive(void) goto table_complete_fail; } + if (dm_get_md_type(md) == DM_TYPE_NONE) { + dm_set_md_type(md, dm_table_get_type(table)); + if (dm_setup_md_queue(md)) { + DMWARN("unable to set up device queue for new table."); + goto setup_md_queue_fail; + } + } else if (dm_get_md_type(md) != dm_table_get_type(table)) { + DMWARN("can't change device type after initial table load."); + goto setup_md_queue_fail; + } + /* Suspend the device so that we can bind it to the table. */ if (dm_suspend(md, 0)) { DMDEBUG("failed to suspend the device pre-bind"); @@ -380,6 +393,7 @@ static void __init dm_setup_drive(void) } printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); + dm_unlock_md_type(md); dm_setup_cleanup(); return; @@ -387,9 +401,10 @@ export_fail: resume_fail: table_bind_fail: suspend_fail: +setup_md_queue_fail: table_complete_fail: add_target_fail: - dm_table_put(table); + dm_unlock_md_type(md); dm_table_create_fail: dm_put(md); dm_create_fail: From 26cab56d9af49e78dd1b395454f484003cc85575 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Mon, 2 May 2016 17:29:28 +0200 Subject: [PATCH 0186/3220] ANDROID: dm: fix dm_substitute_devices() When candidate is the last parameter, candidate_end points to the '\0' character and not the DM_FIELD_SEP character. In such a situation, we should not move the candidate_end pointer one character backward. 
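To spell out the off-by-one being fixed, a short worked illustration (names as in do_mounts_dm.c above; the example path is hypothetical):

    /*
     * params = "/dev/vdb"        last parameter, no trailing separator
     *
     * get_dm_option(candidate, &candidate_end, DM_FIELD_SEP) finds no
     * separator, so it leaves candidate_end pointing at the final '\0'
     * (for any earlier parameter it points one past the separator).
     *
     * old code: candidate_end--;          now points at 'b'
     *           *candidate_end = '\0';    lookup sees "/dev/vd", so
     *                                     name_to_dev_t() resolves the
     *                                     wrong device, or none at all
     *
     * fixed:    if (*candidate_end)       step back only when we stopped
     *                   candidate_end--;  on a real separator character
     */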
Signed-off-by: Jeremy Compostella --- init/do_mounts_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c index f521bc5ae248..ecda58df9a19 100644 --- a/init/do_mounts_dm.c +++ b/init/do_mounts_dm.c @@ -176,7 +176,8 @@ static void __init dm_substitute_devices(char *str, size_t str_len) continue; /* Temporarily terminate with a nul */ - candidate_end--; + if (*candidate_end) + candidate_end--; old_char = *candidate_end; *candidate_end = '\0'; From 36d01a590d035804dc100e105facf4049cf6fe43 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 14 Dec 2015 20:09:39 -0800 Subject: [PATCH 0187/3220] ANDROID: dm: Add android verity target This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked to the actual data blocks in the block device. The signature of the metadata contents are verified against the key included in the system keyring. Upon success, the underlying verity target is setup. BUG: 27175947 Change-Id: I7e99644a0960ac8279f02c0158ed20999510ea97 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/Kconfig | 16 + drivers/md/Makefile | 4 + drivers/md/dm-android-verity.c | 771 +++++++++++++++++++++++++++++++++ drivers/md/dm-android-verity.h | 92 ++++ drivers/md/dm-verity-target.c | 12 +- drivers/md/dm-verity.h | 12 + 6 files changed, 901 insertions(+), 6 deletions(-) create mode 100644 drivers/md/dm-android-verity.c create mode 100644 drivers/md/dm-android-verity.h diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d8b0ab6f3753..96b419b544ed 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -500,4 +500,20 @@ config DM_LOG_WRITES If unsure, say N. +config DM_ANDROID_VERITY + bool "Android verity target support" + depends on DM_VERITY + depends on X509_CERTIFICATE_PARSER + depends on SYSTEM_TRUSTED_KEYRING + depends on PUBLIC_KEY_ALGO_RSA + depends on KEYS + depends on ASYMMETRIC_KEY_TYPE + depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + ---help--- + This device-mapper target is virtually a VERITY target. This + target is setup by reading the metadata contents piggybacked + to the actual data blocks in the block device. The signature + of the metadata contents are verified against the key included + in the system keyring. Upon success, the underlying verity + target is setup. endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 62a65764e8e0..c8fb00d8cc36 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -68,3 +68,7 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif + +ifeq ($(CONFIG_DM_ANDROID_VERITY),y) +dm-verity-objs += dm-android-verity.o +endif diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c new file mode 100644 index 000000000000..c77c9fa7a962 --- /dev/null +++ b/drivers/md/dm-android-verity.c @@ -0,0 +1,771 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "dm-verity.h" +#include "dm-android-verity.h" + +static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; + +static int __init verified_boot_state_param(char *line) +{ + strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); + return 1; +} + +__setup("androidboot.verifiedbootstate=", verified_boot_state_param); + +static int __init verity_mode_param(char *line) +{ + strlcpy(veritymode, line, sizeof(veritymode)); + return 1; +} + +__setup("androidboot.veritymode=", verity_mode_param); + +static int table_extract_mpi_array(struct public_key_signature *pks, + const void *data, size_t len) +{ + MPI mpi = mpi_read_raw_data(data, len); + + if (!mpi) { + DMERR("Error while allocating mpi array"); + return -ENOMEM; + } + + pks->mpi[0] = mpi; + pks->nr_mpi = 1; + return 0; +} + +static struct public_key_signature *table_make_digest( + enum pkey_hash_algo hash, + const void *table, + unsigned long table_len) +{ + struct public_key_signature *pks = NULL; + struct crypto_shash *tfm; + struct shash_desc *desc; + size_t digest_size, desc_size; + int ret; + + /* Allocate the hashing algorithm we're going to need and find out how + * big the hash operational data will be. + */ + tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + if (IS_ERR(tfm)) + return ERR_CAST(tfm); + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + digest_size = crypto_shash_digestsize(tfm); + + /* We allocate the hash operational data storage on the end of out + * context data and the digest output buffer on the end of that. 
+ */ + ret = -ENOMEM; + pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); + if (!pks) + goto error; + + pks->pkey_hash_algo = hash; + pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; + pks->digest_size = digest_size; + + desc = (struct shash_desc *)(pks + 1); + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto error; + + ret = crypto_shash_finup(desc, table, table_len, pks->digest); + if (ret < 0) + goto error; + + crypto_free_shash(tfm); + return pks; + +error: + kfree(pks); + crypto_free_shash(tfm); + return ERR_PTR(ret); +} + +static int read_block_dev(struct bio_read *payload, struct block_device *bdev, + sector_t offset, int length) +{ + struct bio *bio; + int err = 0, i; + + payload->number_of_pages = DIV_ROUND_UP(length, PAGE_SIZE); + + bio = bio_alloc(GFP_KERNEL, payload->number_of_pages); + if (!bio) { + DMERR("Error while allocating bio"); + return -ENOMEM; + } + + bio->bi_bdev = bdev; + bio->bi_sector = offset; + + payload->page_io = kzalloc(sizeof(struct page *) * + payload->number_of_pages, GFP_KERNEL); + if (!payload->page_io) { + DMERR("page_io array alloc failed"); + err = -ENOMEM; + goto free_bio; + } + + for (i = 0; i < payload->number_of_pages; i++) { + payload->page_io[i] = alloc_page(GFP_KERNEL); + if (!payload->page_io[i]) { + DMERR("alloc_page failed"); + err = -ENOMEM; + goto free_pages; + } + if (!bio_add_page(bio, payload->page_io[i], PAGE_SIZE, 0)) { + DMERR("bio_add_page error"); + err = -EIO; + goto free_pages; + } + } + + if (!submit_bio_wait(READ, bio)) + /* success */ + goto free_bio; + DMERR("bio read failed"); + err = -EIO; + +free_pages: + for (i = 0; i < payload->number_of_pages; i++) + if (payload->page_io[i]) + __free_page(payload->page_io[i]); + kfree(payload->page_io); +free_bio: + bio_put(bio); + return err; +} + +static inline u64 fec_div_round_up(u64 x, u64 y) +{ + u64 remainder; + + return div64_u64_rem(x, y, &remainder) + + (remainder > 0 ? 1 : 0); +} + +static inline void populate_fec_metadata(struct fec_header *header, + struct fec_ecc_metadata *ecc) +{ + ecc->blocks = fec_div_round_up(le64_to_cpu(header->inp_size), + FEC_BLOCK_SIZE); + ecc->roots = le32_to_cpu(header->roots); + ecc->start = le64_to_cpu(header->inp_size); +} + +static inline int validate_fec_header(struct fec_header *header, u64 offset) +{ + /* move offset to make the sanity check work for backup header + * as well. */ + offset -= offset % FEC_BLOCK_SIZE; + if (le32_to_cpu(header->magic) != FEC_MAGIC || + le32_to_cpu(header->version) != FEC_VERSION || + le32_to_cpu(header->size) != sizeof(struct fec_header) || + le32_to_cpu(header->roots) == 0 || + le32_to_cpu(header->roots) >= FEC_RSM || + offset < le32_to_cpu(header->fec_size) || + offset - le32_to_cpu(header->fec_size) != + le64_to_cpu(header->inp_size)) + return -EINVAL; + + return 0; +} + +static int extract_fec_header(dev_t dev, struct fec_header *fec, + struct fec_ecc_metadata *ecc) +{ + u64 device_size; + struct bio_read payload; + int i, err = 0; + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + + if (IS_ERR(bdev)) { + DMERR("bdev get error"); + return PTR_ERR(bdev); + } + + device_size = i_size_read(bdev->bd_inode); + + /* fec metadata size is a power of 2 and PAGE_SIZE + * is a power of 2 as well. 
+ */ + BUG_ON(FEC_BLOCK_SIZE > PAGE_SIZE); + /* 512 byte sector alignment */ + BUG_ON(((device_size - FEC_BLOCK_SIZE) % (1 << SECTOR_SHIFT)) != 0); + + err = read_block_dev(&payload, bdev, (device_size - + FEC_BLOCK_SIZE) / (1 << SECTOR_SHIFT), FEC_BLOCK_SIZE); + if (err) { + DMERR("Error while reading verity metadata"); + goto error; + } + + BUG_ON(sizeof(struct fec_header) > PAGE_SIZE); + memcpy(fec, page_address(payload.page_io[0]), + sizeof(*fec)); + + ecc->valid = true; + if (validate_fec_header(fec, device_size - FEC_BLOCK_SIZE)) { + /* Try the backup header */ + memcpy(fec, page_address(payload.page_io[0]) + FEC_BLOCK_SIZE + - sizeof(*fec) , + sizeof(*fec)); + if (validate_fec_header(fec, device_size - + sizeof(struct fec_header))) + ecc->valid = false; + } + + if (ecc->valid) + populate_fec_metadata(fec, ecc); + + for (i = 0; i < payload.number_of_pages; i++) + __free_page(payload.page_io[i]); + kfree(payload.page_io); + +error: + blkdev_put(bdev, FMODE_READ); + return err; +} +static void find_metadata_offset(struct fec_header *fec, + struct block_device *bdev, u64 *metadata_offset) +{ + u64 device_size; + + device_size = i_size_read(bdev->bd_inode); + + if (le32_to_cpu(fec->magic) == FEC_MAGIC) + *metadata_offset = le64_to_cpu(fec->inp_size) - + VERITY_METADATA_SIZE; + else + *metadata_offset = device_size - VERITY_METADATA_SIZE; +} + +static struct android_metadata *extract_metadata(dev_t dev, + struct fec_header *fec) +{ + struct block_device *bdev; + struct android_metadata_header *header; + struct android_metadata *uninitialized_var(metadata); + int i; + u32 table_length, copy_length, offset; + u64 metadata_offset; + struct bio_read payload; + int err = 0; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + + if (IS_ERR(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return ERR_CAST(bdev); + } + + find_metadata_offset(fec, bdev, &metadata_offset); + + /* Verity metadata size is a power of 2 and PAGE_SIZE + * is a power of 2 as well. + * PAGE_SIZE is also a multiple of 512 bytes. 
+ */ + if (VERITY_METADATA_SIZE > PAGE_SIZE) + BUG_ON(VERITY_METADATA_SIZE % PAGE_SIZE != 0); + /* 512 byte sector alignment */ + BUG_ON(metadata_offset % (1 << SECTOR_SHIFT) != 0); + + err = read_block_dev(&payload, bdev, metadata_offset / + (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); + if (err) { + DMERR("Error while reading verity metadata"); + metadata = ERR_PTR(err); + goto blkdev_release; + } + + header = kzalloc(sizeof(*header), GFP_KERNEL); + if (!header) { + DMERR("kzalloc failed for header"); + err = -ENOMEM; + goto free_payload; + } + + memcpy(header, page_address(payload.page_io[0]), + sizeof(*header)); + + DMINFO("bio magic_number:%u protocol_version:%d table_length:%u", + le32_to_cpu(header->magic_number), + le32_to_cpu(header->protocol_version), + le32_to_cpu(header->table_length)); + + metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); + if (!metadata) { + DMERR("kzalloc for metadata failed"); + err = -ENOMEM; + goto free_header; + } + + metadata->header = header; + table_length = le32_to_cpu(header->table_length); + + if (table_length == 0 || + table_length > (VERITY_METADATA_SIZE - + sizeof(struct android_metadata_header))) + goto free_metadata; + + metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + + if (!metadata->verity_table) { + DMERR("kzalloc verity_table failed"); + err = -ENOMEM; + goto free_metadata; + } + + if (sizeof(struct android_metadata_header) + + table_length <= PAGE_SIZE) { + memcpy(metadata->verity_table, page_address(payload.page_io[0]) + + sizeof(struct android_metadata_header), + table_length); + } else { + copy_length = PAGE_SIZE - + sizeof(struct android_metadata_header); + memcpy(metadata->verity_table, page_address(payload.page_io[0]) + + sizeof(struct android_metadata_header), + copy_length); + table_length -= copy_length; + offset = copy_length; + i = 1; + while (table_length != 0) { + if (table_length > PAGE_SIZE) { + memcpy(metadata->verity_table + offset, + page_address(payload.page_io[i]), + PAGE_SIZE); + offset += PAGE_SIZE; + table_length -= PAGE_SIZE; + } else { + memcpy(metadata->verity_table + offset, + page_address(payload.page_io[i]), + table_length); + table_length = 0; + } + i++; + } + } + metadata->verity_table[table_length] = '\0'; + + goto free_payload; + +free_metadata: + kfree(metadata); +free_header: + kfree(header); + metadata = ERR_PTR(err); +free_payload: + for (i = 0; i < payload.number_of_pages; i++) + if (payload.page_io[i]) + __free_page(payload.page_io[i]); + kfree(payload.page_io); + + DMINFO("verity_table: %s", metadata->verity_table); +blkdev_release: + blkdev_put(bdev, FMODE_READ); + return metadata; +} + +/* helper functions to extract properties from dts */ +const char *find_dt_value(const char *name) +{ + struct device_node *firmware; + const char *value; + + firmware = of_find_node_by_path("/firmware/android"); + if (!firmware) + return NULL; + value = of_get_property(firmware, name, NULL); + of_node_put(firmware); + + return value; +} + +static bool is_unlocked(void) +{ + static const char unlocked[] = "orange"; + static const char verified_boot_prop[] = "verifiedbootstate"; + const char *value; + + value = find_dt_value(verified_boot_prop); + if (!value) + value = verifiedbootstate; + + return !strncmp(value, unlocked, sizeof(unlocked) - 1); +} + +static int verity_mode(void) +{ + static const char enforcing[] = "enforcing"; + static const char verified_mode_prop[] = "veritymode"; + const char *value; + + value = find_dt_value(verified_mode_prop); + if (!value) + value = veritymode; + if 
(!strncmp(value, enforcing, sizeof(enforcing) - 1)) + return DM_VERITY_MODE_RESTART; + + return DM_VERITY_MODE_EIO; +} + +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_unlocked() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) { + retval = VERITY_STATE_DISABLE; + return retval; + } + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int verify_verity_signature(char *key_id, + struct android_metadata *metadata) +{ + key_ref_t key_ref; + struct key *key; + struct public_key_signature *pks = NULL; + int retval = -EINVAL; + + key_ref = keyring_search(make_key_ref(system_trusted_keyring, 1), + &key_type_asymmetric, key_id); + + if (IS_ERR(key_ref)) { + DMERR("keyring: key not found"); + return -ENOKEY; + } + + key = key_ref_to_ptr(key_ref); + + pks = table_make_digest(PKEY_HASH_SHA256, + (const void *)metadata->verity_table, + le32_to_cpu(metadata->header->table_length)); + + if (IS_ERR(pks)) { + DMERR("hashing failed"); + goto error; + } + + retval = table_extract_mpi_array(pks, &metadata->header->signature[0], + RSANUMBYTES); + if (retval < 0) { + DMERR("Error extracting mpi %d", retval); + goto error; + } + + retval = verify_signature(key, pks); + mpi_free(pks->rsa.s); +error: + kfree(pks); + key_put(key); + + return retval; +} + +static void handle_error(void) +{ + int mode = verity_mode(); + if (mode == DM_VERITY_MODE_RESTART) { + DMERR("triggering restart"); + kernel_restart("dm-verity device corrupted"); + } else { + DMERR("Mounting verity root failed"); + } +} + +static inline bool test_mult_overflow(sector_t a, u32 b) +{ + sector_t r = (sector_t)~0ULL; + + sector_div(r, b); + return a > r; +} + +/* + * Target parameters: + * Key id of the public key in the system keyring. + * Verity metadata's signature would be verified against + * this. If the key id contains spaces, replace them + * with '#'. + * The block device for which dm-verity is being setup. 
+ */ +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + dev_t uninitialized_var(dev); + struct android_metadata *uninitialized_var(metadata); + int err = 0, i, mode; + char *key_id, *table_ptr, dummy, + *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; + /* One for specifying number of opt args and one for mode */ + sector_t data_sectors; + u32 data_block_size; + unsigned int major, minor, + no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + struct fec_header fec; + struct fec_ecc_metadata uninitialized_var(ecc); + char buf[FEC_ARG_LENGTH], *buf_ptr; + unsigned long long tmpll; + + if (argc != 2) { + DMERR("Incorrect number of arguments"); + handle_error(); + return -EINVAL; + } + + /* should come as one of the arguments for the verity target */ + key_id = argv[0]; + strreplace(argv[0], '#', ' '); + + if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) { + DMERR("Incorrect bdev major minor number"); + handle_error(); + return -EOVERFLOW; + } + } + + DMINFO("key:%s dev:%s", argv[0], argv[1]); + + if (extract_fec_header(dev, &fec, &ecc)) { + DMERR("Error while extracting fec header"); + handle_error(); + return -EINVAL; + } + + metadata = extract_metadata(dev, &fec); + + if (IS_ERR(metadata)) { + DMERR("Error while extracting metadata"); + handle_error(); + return -EINVAL; + } + + err = verify_header(metadata->header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + return -EINVAL; + } else if (err) { + DMERR("Verity header handle error"); + handle_error(); + goto free_metadata; + } + + err = verify_verity_signature(key_id, metadata); + + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + + table_ptr = metadata->verity_table; + + for (i = 0; i < VERITY_TABLE_ARGS; i++) { + verity_table_args[i] = strsep(&table_ptr, " "); + if (verity_table_args[i] == NULL) + break; + } + + if (i != VERITY_TABLE_ARGS) { + DMERR("Verity table not in the expected format"); + err = -EINVAL; + handle_error(); + goto free_metadata; + } + + if (sscanf(verity_table_args[5], "%llu%c", &tmpll, &dummy) + != 1) { + DMERR("Verity table not in the expected format"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + if (tmpll > ULONG_MAX) { + DMERR(" too large. Forgot to turn on CONFIG_LBDAF?"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + data_sectors = tmpll; + + if (sscanf(verity_table_args[3], "%u%c", &data_block_size, &dummy) + != 1) { + DMERR("Verity table not in the expected format"); + handle_error(); + err = -EINVAL; + goto free_metadata; + } + + if (test_mult_overflow(data_sectors, data_block_size >> + SECTOR_SHIFT)) { + DMERR("data_sectors too large"); + handle_error(); + err = -EOVERFLOW; + goto free_metadata; + } + + data_sectors *= data_block_size >> SECTOR_SHIFT; + DMINFO("Data sectors %llu", (unsigned long long)data_sectors); + + /* update target length */ + ti->len = data_sectors; + + /*substitute data_dev and hash_dev*/ + verity_table_args[1] = argv[1]; + verity_table_args[2] = argv[1]; + + mode = verity_mode(); + + if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { + if (mode) { + err = snprintf(buf, FEC_ARG_LENGTH, + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? 
+ VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, + argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); + } else { + err = snprintf(buf, FEC_ARG_LENGTH, + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, argv[1], + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + } + } else if (mode) { + err = snprintf(buf, FEC_ARG_LENGTH, + "2 " VERITY_TABLE_OPT_IGNZERO " %s", + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING); + } else { + err = snprintf(buf, FEC_ARG_LENGTH, "1 %s", + "ignore_zero_blocks"); + } + + if (err < 0 || err >= FEC_ARG_LENGTH) + goto free_metadata; + + buf_ptr = buf; + + for (i = VERITY_TABLE_ARGS; i < (VERITY_TABLE_ARGS + + VERITY_TABLE_OPT_FEC_ARGS + 2); i++) { + verity_table_args[i] = strsep(&buf_ptr, " "); + if (verity_table_args[i] == NULL) { + no_of_args = i; + break; + } + } + + err = verity_ctr(ti, no_of_args, verity_table_args); + +free_metadata: + kfree(metadata->header); + kfree(metadata->verity_table); + kfree(metadata); + return err; +} + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + +static int __init dm_android_verity_init(void) +{ + int r; + + r = dm_register_target(&android_verity_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_android_verity_exit(void) +{ + dm_unregister_target(&android_verity_target); +} + +module_init(dm_android_verity_init); +module_exit(dm_android_verity_exit); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h new file mode 100644 index 000000000000..11477ffd2243 --- /dev/null +++ b/drivers/md/dm-android-verity.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef DM_ANDROID_VERITY_H +#define DM_ANDROID_VERITY_H + +#include + +#define RSANUMBYTES 256 +#define VERITY_METADATA_MAGIC_NUMBER 0xb001b001 +#define VERITY_METADATA_MAGIC_DISABLE 0x46464f56 +#define VERITY_METADATA_VERSION 0 +#define VERITY_STATE_DISABLE 1 +#define DATA_BLOCK_SIZE (4 * 1024) +#define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) +#define VERITY_TABLE_ARGS 10 +#define VERITY_COMMANDLINE_PARAM_LENGTH 20 + +#define FEC_MAGIC 0xFECFECFE +#define FEC_BLOCK_SIZE (4 * 1024) +#define FEC_VERSION 0 +#define FEC_RSM 255 +#define FEC_ARG_LENGTH 300 + +#define VERITY_TABLE_OPT_RESTART "restart_on_corruption" +#define VERITY_TABLE_OPT_LOGGING "ignore_corruption" +#define VERITY_TABLE_OPT_IGNZERO "ignore_zero_blocks" + +#define VERITY_TABLE_OPT_FEC_FORMAT \ + "use_fec_from_device %s fec_start %llu fec_blocks %llu fec_roots %u ignore_zero_blocks" +#define VERITY_TABLE_OPT_FEC_ARGS 9 + +#define VERITY_DEBUG 0 + +#define DM_MSG_PREFIX "android-verity" +/* + * There can be two formats. 
+ * if fec is present + * + * if fec is not present + * + */ +/* TODO: rearrange structure to reduce memory holes + * depends on userspace change. + */ +struct fec_header { + __le32 magic; + __le32 version; + __le32 size; + __le32 roots; + __le32 fec_size; + __le64 inp_size; + u8 hash[SHA256_DIGEST_SIZE]; +}; + +struct android_metadata_header { + __le32 magic_number; + __le32 protocol_version; + char signature[RSANUMBYTES]; + __le32 table_length; +}; + +struct android_metadata { + struct android_metadata_header *header; + char *verity_table; +}; + +struct fec_ecc_metadata { + bool valid; + u32 roots; + u64 blocks; + u64 rounds; + u64 start; +}; + +struct bio_read { + struct page **page_io; + int number_of_pages; +}; + +#endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c5d30cb6ec5..65835f15a116 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -551,7 +551,7 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) * Bio map function. It allocates dm_verity_io structure and bio vector and * fills them. Then it issues prefetches and the I/O. */ -static int verity_map(struct dm_target *ti, struct bio *bio) +int verity_map(struct dm_target *ti, struct bio *bio) { struct dm_verity *v = ti->private; struct dm_verity_io *io; @@ -596,7 +596,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) /* * Status: V (valid) or C (corruption found) */ -static void verity_status(struct dm_target *ti, status_type_t type, +void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; @@ -669,7 +669,7 @@ static int verity_prepare_ioctl(struct dm_target *ti, return 0; } -static int verity_iterate_devices(struct dm_target *ti, +int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct dm_verity *v = ti->private; @@ -677,7 +677,7 @@ static int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, v->data_start, ti->len, data); } -static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) +void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct dm_verity *v = ti->private; @@ -690,7 +690,7 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } -static void verity_dtr(struct dm_target *ti) +void verity_dtr(struct dm_target *ti) { struct dm_verity *v = ti->private; @@ -817,7 +817,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) * * Hex string or "-" if no salt. 
*/ -static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index fb419f422d73..d9cf5e4939eb 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -126,4 +126,16 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); +extern void verity_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int verity_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg); +extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int verity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); +extern void verity_dtr(struct dm_target *ti); +extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv); +extern int verity_map(struct dm_target *ti, struct bio *bio); #endif /* DM_VERITY_H */ From f42f971b7b59257a8610608b7f45a046c7c57b5d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 8 Feb 2016 16:28:43 -0800 Subject: [PATCH 0188/3220] ANDROID: dm-android-verity: Rebase on top of 4.1 Following CLs in upstream causes minor changes to dm-android-verity target. 1. keys: change asymmetric keys to use common hash definitions 2. block: Abstract out bvec iterator Rebase dm-android-verity on top of these changes. Bug: 27175947 Signed-off-by: Badhri Jagan Sridharan Change-Id: Icfdc3e7b3ead5de335a059cade1aca70414db415 --- drivers/md/dm-android-verity.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index c77c9fa7a962..aeb5045830d9 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -75,7 +75,7 @@ static int table_extract_mpi_array(struct public_key_signature *pks, } static struct public_key_signature *table_make_digest( - enum pkey_hash_algo hash, + enum hash_algo hash, const void *table, unsigned long table_len) { @@ -88,7 +88,7 @@ static struct public_key_signature *table_make_digest( /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. 
*/ - tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return ERR_CAST(tfm); @@ -143,7 +143,7 @@ static int read_block_dev(struct bio_read *payload, struct block_device *bdev, } bio->bi_bdev = bdev; - bio->bi_sector = offset; + bio->bi_iter.bi_sector = offset; payload->page_io = kzalloc(sizeof(struct page *) * payload->number_of_pages, GFP_KERNEL); @@ -505,7 +505,7 @@ static int verify_verity_signature(char *key_id, key = key_ref_to_ptr(key_ref); - pks = table_make_digest(PKEY_HASH_SHA256, + pks = table_make_digest(HASH_ALGO_SHA256, (const void *)metadata->verity_table, le32_to_cpu(metadata->header->table_length)); @@ -569,7 +569,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) u32 data_block_size; unsigned int major, minor, no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; - struct fec_header fec; + struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; From 67ce481897e0656404fe0af42d563101a5629bbe Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 21 Mar 2016 10:55:23 -0700 Subject: [PATCH 0189/3220] ANDROID: dm: Mounting root as linear device when verity disabled This CL makes the android-verity target be added as a linear dm device when the bootloader is unlocked and verity is disabled. Bug: 27175947 Change-Id: Ic41ca4b8908fb2777263799cf3a3e25934d70f18 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 131 +++++++++++++++++++++++++++------ drivers/md/dm-android-verity.h | 3 + drivers/md/dm-linear.c | 2 +- 3 files changed, 113 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index aeb5045830d9..db4ddf789a39 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -43,6 +44,25 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static bool target_added; +static bool verity_enabled = true; +struct dentry *debug_dir; +static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv); + +static struct target_type android_verity_target = { + .name = "android-verity", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = android_verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + static int __init verified_boot_state_param(char *line) { strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate)); @@ -549,6 +569,35 @@ static inline bool test_mult_overflow(sector_t a, u32 b) return a > r; } +static int add_as_linear_device(struct dm_target *ti, char *dev) +{ + /*Move to linear mapping defines*/ + char *linear_table_args[DM_LINEAR_ARGS]; + char offset[] = "0"; + int err = 0; + + linear_table_args[0] = dev; + linear_table_args[1] = offset; + + android_verity_target.dtr = linear_target.dtr, + android_verity_target.map = linear_target.map, + android_verity_target.status = linear_target.status, + android_verity_target.ioctl = linear_target.ioctl, + android_verity_target.merge = linear_target.merge, + android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.io_hints = NULL; + + err = linear_target.ctr(ti, 
DM_LINEAR_ARGS, linear_table_args); + + if (!err) { + DMINFO("Added android-verity as a linear target"); + target_added = true; + } else + DMERR("Failed to add android-verity as linear target"); + + return err; +} + /* * Target parameters: * Key id of the public key in the system keyring. @@ -613,21 +662,27 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (err == VERITY_STATE_DISABLE) { DMERR("Mounting root with verity disabled"); - return -EINVAL; + verity_enabled = false; + /* we would still have to parse the args to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. + */ } else if (err) { DMERR("Verity header handle error"); handle_error(); goto free_metadata; } - err = verify_verity_signature(key_id, metadata); + if (!verity_enabled) { + err = verify_verity_signature(key_id, metadata); - if (err) { - DMERR("Signature verification failed"); - handle_error(); - goto free_metadata; - } else - DMINFO("Signature verification success"); + if (err) { + DMERR("Signature verification failed"); + handle_error(); + goto free_metadata; + } else + DMINFO("Signature verification success"); + } table_ptr = metadata->verity_table; @@ -683,6 +738,12 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* update target length */ ti->len = data_sectors; + /* Setup linear target and free */ + if (!verity_enabled) { + err = add_as_linear_device(ti, argv[1]); + goto free_metadata; + } + /*substitute data_dev and hash_dev*/ verity_table_args[1] = argv[1]; verity_table_args[2] = argv[1]; @@ -730,6 +791,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) err = verity_ctr(ti, no_of_args, verity_table_args); + if (err) + DMERR("android-verity failed to mount as verity target"); + else { + target_added = true; + DMINFO("android-verity mounted as verity target"); + } + free_metadata: kfree(metadata->header); kfree(metadata->verity_table); @@ -737,33 +805,52 @@ free_metadata: return err; } -static struct target_type android_verity_target = { - .name = "android-verity", - .version = {1, 0, 0}, - .module = THIS_MODULE, - .ctr = android_verity_ctr, - .dtr = verity_dtr, - .map = verity_map, - .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, - .iterate_devices = verity_iterate_devices, - .io_hints = verity_io_hints, -}; - static int __init dm_android_verity_init(void) { int r; + struct dentry *file; r = dm_register_target(&android_verity_target); if (r < 0) DMERR("register failed %d", r); + /* Tracks the status of the last added target */ + debug_dir = debugfs_create_dir("android_verity", NULL); + + if (IS_ERR_OR_NULL(debug_dir)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + goto end; + } + + file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, + (u32 *)&target_added); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + goto end; + } + + file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, + (u32 *)&verity_enabled); + + if (IS_ERR_OR_NULL(file)) { + DMERR("Cannot create android_verity debugfs directory: %ld", + PTR_ERR(debug_dir)); + debugfs_remove_recursive(debug_dir); + } + +end: return r; } static void __exit dm_android_verity_exit(void) { + if (!IS_ERR_OR_NULL(debug_dir)) + debugfs_remove_recursive(debug_dir); + dm_unregister_target(&android_verity_target); } 
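For reference, the two flags registered above surface as boolean files that read back "Y" or "N" once debugfs is mounted. A minimal userspace sketch for checking them (the /sys/kernel/debug mount point is an assumption; the file names follow from the debugfs_create_bool() calls above):

	#include <stdio.h>

	/* returns 1 if the flag reads 'Y', 0 if 'N', -1 if the file is missing */
	static int read_debugfs_bool(const char *path)
	{
		FILE *f = fopen(path, "r");
		char c = 0;

		if (!f)
			return -1;
		if (fread(&c, 1, 1, f) != 1)
			c = 0;
		fclose(f);
		return c == 'Y';
	}

	int main(void)
	{
		printf("target_added: %d\n", read_debugfs_bool(
			"/sys/kernel/debug/android_verity/target_added"));
		printf("verity_enabled: %d\n", read_debugfs_bool(
			"/sys/kernel/debug/android_verity/verity_enabled"));
		return 0;
	}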
diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 11477ffd2243..2cf7de1b7910 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -44,6 +44,8 @@ #define VERITY_DEBUG 0 #define DM_MSG_PREFIX "android-verity" + +#define DM_LINEAR_ARGS 2 /* * There can be two formats. * if fec is present @@ -89,4 +91,5 @@ struct bio_read { int number_of_pages; }; +extern struct target_type linear_target; #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 05c35aacb3aa..d8bf876e7bed 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -141,7 +141,7 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -static struct target_type linear_target = { +struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, From 438e162621ac7e80ec9c81bea3f0eaec0b801ff9 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 28 Mar 2016 14:41:21 -0700 Subject: [PATCH 0190/3220] ANDROID: dm: Minor cleanup Compacts the linear device arguments removing the unnecessary variables. Bug: 27175947 Change-Id: I157170eebe3c0f89a68ae05870a1060f188d0da0 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/dm-android-verity.c | 7 ++----- drivers/md/dm-android-verity.h | 2 ++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index db4ddf789a39..f6ddbee5e2d3 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -572,13 +572,10 @@ static inline bool test_mult_overflow(sector_t a, u32 b) static int add_as_linear_device(struct dm_target *ti, char *dev) { /*Move to linear mapping defines*/ - char *linear_table_args[DM_LINEAR_ARGS]; - char offset[] = "0"; + char *linear_table_args[DM_LINEAR_ARGS] = {dev, + DM_LINEAR_TARGET_OFFSET}; int err = 0; - linear_table_args[0] = dev; - linear_table_args[1] = offset; - android_verity_target.dtr = linear_target.dtr, android_verity_target.map = linear_target.map, android_verity_target.status = linear_target.status, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 2cf7de1b7910..fe53863c664b 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -46,6 +46,8 @@ #define DM_MSG_PREFIX "android-verity" #define DM_LINEAR_ARGS 2 +#define DM_LINEAR_TARGET_OFFSET "0" + /* * There can be two formats. 
* if fec is present From 86fd82659f745b5fea48090fe320cb817e199805 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 5 Apr 2016 11:18:16 -0700 Subject: [PATCH 0191/3220] ANDROID: dm: rename dm-linear methods for dm-android-verity This keeps linear_target as static variable and just exposes the linear target methods for android-verity Cherry-picked: https://android-review.googlesource.com/#/c/212858 Change-Id: I4a377e417b00afd9ecccdb3e605fea31a7df112e Signed-off-by: Badhri Jagan Sridharan (cherry picked from commit a6d1b091f40b25d97849487e29ec097bc5f568dd) --- drivers/md/dm-android-verity.c | 14 +++++++------- drivers/md/dm-android-verity.h | 12 ++++++++++++ drivers/md/dm-linear.c | 26 +++++++++++++------------- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index f6ddbee5e2d3..b7e059595f75 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -576,15 +576,15 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) DM_LINEAR_TARGET_OFFSET}; int err = 0; - android_verity_target.dtr = linear_target.dtr, - android_verity_target.map = linear_target.map, - android_verity_target.status = linear_target.status, - android_verity_target.ioctl = linear_target.ioctl, - android_verity_target.merge = linear_target.merge, - android_verity_target.iterate_devices = linear_target.iterate_devices, + android_verity_target.dtr = dm_linear_dtr, + android_verity_target.map = dm_linear_map, + android_verity_target.status = dm_linear_status, + android_verity_target.ioctl = dm_linear_ioctl, + android_verity_target.merge = dm_linear_merge, + android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; - err = linear_target.ctr(ti, DM_LINEAR_ARGS, linear_table_args); + err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args); if (!err) { DMINFO("Added android-verity as a linear target"); diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index fe53863c664b..efb796524896 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -94,4 +94,16 @@ struct bio_read { }; extern struct target_type linear_target; + +extern void dm_linear_dtr(struct dm_target *ti); +extern int dm_linear_map(struct dm_target *ti, struct bio *bio); +extern void dm_linear_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen); +extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, + unsigned long arg); +extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size); +extern int dm_linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data); +extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); #endif /* DM_ANDROID_VERITY_H */ diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index d8bf876e7bed..74caca2888a6 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -25,7 +25,7 @@ struct linear_c { /* * Construct a linear mapping: */ -static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct linear_c *lc; unsigned long long tmp; @@ -67,7 +67,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) return ret; } -static void linear_dtr(struct dm_target *ti) +void dm_linear_dtr(struct dm_target *ti) { struct 
linear_c *lc = (struct linear_c *) ti->private; @@ -92,14 +92,14 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) linear_map_sector(ti, bio->bi_iter.bi_sector); } -static int linear_map(struct dm_target *ti, struct bio *bio) +int dm_linear_map(struct dm_target *ti, struct bio *bio) { linear_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } -static void linear_status(struct dm_target *ti, status_type_t type, +void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -116,7 +116,7 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, +static int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -133,7 +133,7 @@ static int linear_prepare_ioctl(struct dm_target *ti, return 0; } -static int linear_iterate_devices(struct dm_target *ti, +int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct linear_c *lc = ti->private; @@ -141,16 +141,16 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -struct target_type linear_target = { +static struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, - .ctr = linear_ctr, - .dtr = linear_dtr, - .map = linear_map, - .status = linear_status, - .prepare_ioctl = linear_prepare_ioctl, - .iterate_devices = linear_iterate_devices, + .ctr = dm_linear_ctr, + .dtr = dm_linear_dtr, + .map = dm_linear_map, + .status = dm_linear_status, + .prepare_ioctl = dm_linear_prepare_ioctl, + .iterate_devices = dm_linear_iterate_devices, }; int __init dm_linear_init(void) From a517817c1708dd9d762d37d7dd475d4728a4b34e Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Fri, 15 Apr 2016 13:32:54 +0200 Subject: [PATCH 0192/3220] ANDROID: dm: use name_to_dev_t This patch makes android_verity_ctr() parse its block device string parameter with name_to_dev_t(). It allows the use of less hardware-specific block device references, such as PARTUUID. 
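Since name_to_dev_t() also understands references such as "PARTUUID=", a boot line can now name the partition by UUID instead of a device node; for instance (the UUID is a made-up placeholder, and note that at this point in the series the key id still precedes the device — patch 0194 below reverses that order):

	dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f PARTUUID=00112233-4455-6677-8899-aabbccddeeff"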
Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf07 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index b7e059595f75..9c26cbb5f179 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -613,8 +613,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* One for specifying number of opt args and one for mode */ sector_t data_sectors; u32 data_block_size; - unsigned int major, minor, - no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; + unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS; struct fec_header uninitialized_var(fec); struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; @@ -630,13 +629,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) key_id = argv[0]; strreplace(argv[0], '#', ' '); - if (sscanf(argv[1], "%u:%u%c", &major, &minor, &dummy) == 2) { - dev = MKDEV(major, minor); - if (MAJOR(dev) != major || MINOR(dev) != minor) { - DMERR("Incorrect bdev major minor number"); - handle_error(); - return -EOVERFLOW; - } + dev = name_to_dev_t(argv[1]); + if (!dev) { + DMERR("no dev found for %s", argv[1]); + handle_error(); + return -EINVAL; } DMINFO("key:%s dev:%s", argv[0], argv[1]); From 9c43aca47bbdf652fc0d05d8e94ee43aaeea2458 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:44:19 -0700 Subject: [PATCH 0193/3220] ANDROID: dm: fix signature verification flag The bug was that the signature verification was only happening when verity was disabled. It should always happen when verity is enabled. Signed-off-by: Badhri Jagan Sridharan Change-Id: I2d9354e240d36ea06fc68c2d18d8e87b823a4c2f (cherry picked from commit 5364b5ca0b1a12a58283b51408e43fc36d4e4fe7) --- drivers/md/dm-android-verity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 9c26cbb5f179..00275a986d03 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -667,7 +667,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto free_metadata; } - if (!verity_enabled) { + if (verity_enabled) { err = verify_verity_signature(key_id, metadata); if (err) { From 051d4706c621a73d3622f0b0d91a444ab37a8f1a Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 20 May 2016 16:45:45 -0700 Subject: [PATCH 0194/3220] ANDROID: dm: use default verity public key If the dm-android-verity target arguments do not provide a key id, try using the default public key from the system keyring. The default verity keyid is passed as a kernel command line argument veritykeyid=. The order of the dm-android-verity params has been reversed to facilitate the change. Old format example: dm="system none ro,0 1 android-verity Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f /dev/mmcblk0p43" New formats supported: dm="system none ro,0 1 android-verity /dev/mmcblk0p43 Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f" (or) dm="system none ro,0 1 android-verity /dev/mmcblk0p43" when veritykeyid= is set in the kernel command line. 
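For example, with the key id supplied once via the command line parameter added in this patch (key id reused from the examples above), the target argument list reduces to the device alone:

	veritykeyid=Android:#7e4333f9bba00adfe0ede979e28ed1920492b40f dm="system none ro,0 1 android-verity /dev/mmcblk0p43"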
BUG: 28384658 Signed-off-by: Badhri Jagan Sridharan Change-Id: I506c89b053d835ab579e703eef2bc1f8487250de (cherry picked from commit c5c74d0327729f35b576564976885596c6d0e7fb) --- drivers/md/dm-android-verity.c | 67 ++++++++++++++++++++++++---------- drivers/md/dm-android-verity.h | 16 ++++++++ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 00275a986d03..097fb2b1de89 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -43,6 +43,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; +static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; static bool target_added; static bool verity_enabled = true; @@ -79,6 +80,19 @@ static int __init verity_mode_param(char *line) __setup("androidboot.veritymode=", verity_mode_param); +static int __init verity_keyid_param(char *line) +{ + strlcpy(veritykeyid, line, sizeof(veritykeyid)); + return 1; +} + +__setup("veritykeyid=", verity_keyid_param); + +static inline bool default_verity_key_id(void) +{ + return veritykeyid[0] != '\0'; +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -608,7 +622,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) dev_t uninitialized_var(dev); struct android_metadata *uninitialized_var(metadata); int err = 0, i, mode; - char *key_id, *table_ptr, dummy, + char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; /* One for specifying number of opt args and one for mode */ sector_t data_sectors; @@ -619,24 +633,34 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - if (argc != 2) { + if (argc == 1) { + /* Use the default keyid */ + if (default_verity_key_id()) + key_id = veritykeyid; + else { + DMERR("veritykeyid= is not set"); + handle_error(); + return -EINVAL; + } + } else if (argc == 2) + key_id = argv[1]; + else { DMERR("Incorrect number of arguments"); handle_error(); return -EINVAL; } - /* should come as one of the arguments for the verity target */ - key_id = argv[0]; - strreplace(argv[0], '#', ' '); + strreplace(key_id, '#', ' '); + target_device = argv[0]; - dev = name_to_dev_t(argv[1]); + dev = name_to_dev_t(target_device); if (!dev) { - DMERR("no dev found for %s", argv[1]); + DMERR("no dev found for %s", target_device); handle_error(); return -EINVAL; } - DMINFO("key:%s dev:%s", argv[0], argv[1]); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { DMERR("Error while extracting fec header"); @@ -734,30 +758,33 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) /* Setup linear target and free */ if (!verity_enabled) { - err = add_as_linear_device(ti, argv[1]); + err = add_as_linear_device(ti, target_device); goto free_metadata; } /*substitute data_dev and hash_dev*/ - verity_table_args[1] = argv[1]; - verity_table_args[2] = argv[1]; + verity_table_args[1] = target_device; + verity_table_args[2] = target_device; mode = verity_mode(); if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) { if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, - "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, - 1 + VERITY_TABLE_OPT_FEC_ARGS, - mode == DM_VERITY_MODE_RESTART ? 
- VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING, - argv[1], ecc.start / FEC_BLOCK_SIZE, ecc.blocks, - ecc.roots); + "%u %s " VERITY_TABLE_OPT_FEC_FORMAT, + 1 + VERITY_TABLE_OPT_FEC_ARGS, + mode == DM_VERITY_MODE_RESTART ? + VERITY_TABLE_OPT_RESTART : + VERITY_TABLE_OPT_LOGGING, + target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } else { err = snprintf(buf, FEC_ARG_LENGTH, - "%u " VERITY_TABLE_OPT_FEC_FORMAT, - VERITY_TABLE_OPT_FEC_ARGS, argv[1], - ecc.start / FEC_BLOCK_SIZE, ecc.blocks, ecc.roots); + "%u " VERITY_TABLE_OPT_FEC_FORMAT, + VERITY_TABLE_OPT_FEC_ARGS, target_device, + ecc.start / FEC_BLOCK_SIZE, ecc.blocks, + ecc.roots); } } else if (mode) { err = snprintf(buf, FEC_ARG_LENGTH, diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index efb796524896..43655ee0f813 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -27,6 +27,22 @@ #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +/* + * : is the format for the identifier. + * subject can either be the Common Name(CN) + Organization Name(O) or + * just the CN if the it is prefixed with O + * From https://tools.ietf.org/html/rfc5280#appendix-A + * ub-organization-name-length INTEGER ::= 64 + * ub-common-name-length INTEGER ::= 64 + * + * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278 + * ctx->o_size + 2 + ctx->cn_size + 1 + * + 41 characters for ":" and sha1 id + * 64 + 2 + 64 + 1 + 1 + 40 (172) + * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters. + */ +#define VERITY_DEFAULT_KEY_ID_LENGTH 200 + #define FEC_MAGIC 0xFECFECFE #define FEC_BLOCK_SIZE (4 * 1024) #define FEC_VERSION 0 From 58bae772a7018ebfa735c91af22e26474a1dfcc7 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 17 Jun 2016 18:54:35 -0700 Subject: [PATCH 0195/3220] ANDROID: dm: mount as linear target if eng build eng builds don't have verity enabled, i.e. they do not even have verity metadata appended to the partition. Therefore, add rootdev as a linear device and map the entire partition if the build variant is "eng". 
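The build variant reaches the driver through the kernel command line (see the __setup("buildvariant=", ...) handler in the diff below), so an eng device boots with something along the lines of:

	buildvariant=eng dm="system none ro,0 1 android-verity /dev/mmcblk0p43"

in which case the constructor skips all metadata and signature handling and sets up the equivalent of the two-argument linear table "<dev> 0" over the whole partition, as the diff below shows.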
(Cherry-picked based on https://partner-android-review.git.corp.google.com/#/c/618690/) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I8f5c2289b842b820ca04f5773525e5449bb3f355 --- drivers/md/dm-android-verity.c | 62 +++++++++++++++++++++++++++++++--- drivers/md/dm-android-verity.h | 1 + 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 097fb2b1de89..e1a8e284e7e4 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -44,6 +44,7 @@ static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH]; static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH]; +static char buildvariant[BUILD_VARIANT]; static bool target_added; static bool verity_enabled = true; @@ -88,11 +89,26 @@ static int __init verity_keyid_param(char *line) __setup("veritykeyid=", verity_keyid_param); +static int __init verity_buildvariant(char *line) +{ + strlcpy(buildvariant, line, sizeof(buildvariant)); + return 1; +} + +__setup("buildvariant=", verity_buildvariant); + static inline bool default_verity_key_id(void) { return veritykeyid[0] != '\0'; } +static inline bool is_eng(void) +{ + static const char typeeng[] = "eng"; + + return !strncmp(buildvariant, typeeng, sizeof(typeeng)); +} + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -262,7 +278,7 @@ static int extract_fec_header(dev_t dev, struct fec_header *fec, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("bdev get error"); return PTR_ERR(bdev); } @@ -323,6 +339,24 @@ static void find_metadata_offset(struct fec_header *fec, *metadata_offset = device_size - VERITY_METADATA_SIZE; } +static int find_size(dev_t dev, u64 *device_size) +{ + struct block_device *bdev; + + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR_OR_NULL(bdev)) { + DMERR("blkdev_get_by_dev failed"); + return PTR_ERR(bdev); + } + + *device_size = i_size_read(bdev->bd_inode); + *device_size >>= SECTOR_SHIFT; + + DMINFO("blkdev size in sectors: %llu", *device_size); + blkdev_put(bdev, FMODE_READ); + return 0; +} + static struct android_metadata *extract_metadata(dev_t dev, struct fec_header *fec) { @@ -337,7 +371,7 @@ static struct android_metadata *extract_metadata(dev_t dev, bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bdev)) { + if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); return ERR_CAST(bdev); } @@ -632,12 +666,13 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; + u64 device_size; if (argc == 1) { /* Use the default keyid */ if (default_verity_key_id()) key_id = veritykeyid; - else { + else if (!is_eng()) { DMERR("veritykeyid= is not set"); handle_error(); return -EINVAL; @@ -650,7 +685,6 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - strreplace(key_id, '#', ' '); target_device = argv[0]; dev = name_to_dev_t(target_device); @@ -660,6 +694,26 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } + if (is_eng()) { + err = find_size(dev, &device_size); + if (err) { + DMERR("error finding bdev size"); + handle_error(); + return err; + } + + ti->len = device_size; + err = add_as_linear_device(ti, target_device); + if (err) 
{ + handle_error(); + return err; + } + verity_enabled = false; + return 0; + } + + strreplace(key_id, '#', ' '); + DMINFO("key:%s dev:%s", key_id, target_device); if (extract_fec_header(dev, &fec, &ecc)) { diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 43655ee0f813..782e1c815c67 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -26,6 +26,7 @@ #define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE) #define VERITY_TABLE_ARGS 10 #define VERITY_COMMANDLINE_PARAM_LENGTH 20 +#define BUILD_VARIANT 20 /* * : is the format for the identifier. From f74284f6c25dca79c052c3a1e1bd3fc58de4d4a0 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 27 Jun 2016 16:25:55 -0700 Subject: [PATCH 0196/3220] ANDROID: dm: allow adb disable-verity only in userdebug adb disable-verity was allowed when the phone is in the unlocked state. Since the driver is now aware of the build variant, honor "adb disable-verity" only in userdebug builds. (Cherry-picked from https://partner-android-review.git.corp.google.com/#/c/622117) BUG: 29276559 Signed-off-by: Badhri Jagan Sridharan Change-Id: I7ce9f38d8c7a62361392c5a8ccebb288f8a3a2ea --- drivers/md/dm-android-verity.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index e1a8e284e7e4..999e75bf2ba0 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -109,6 +109,14 @@ static inline bool is_eng(void) return !strncmp(buildvariant, typeeng, sizeof(typeeng)); } +static inline bool is_userdebug(void) +{ + static const char typeuserdebug[] = "userdebug"; + + return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug)); +} + + static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) { @@ -499,19 +507,6 @@ const char *find_dt_value(const char *name) return value; } -static bool is_unlocked(void) -{ - static const char unlocked[] = "orange"; - static const char verified_boot_prop[] = "verifiedbootstate"; - const char *value; - - value = find_dt_value(verified_boot_prop); - if (!value) - value = verifiedbootstate; - - return !strncmp(value, unlocked, sizeof(unlocked) - 1); -} - static int verity_mode(void) { static const char enforcing[] = "enforcing"; @@ -531,7 +526,7 @@ static int verify_header(struct android_metadata_header *header) { int retval = -EINVAL; - if (is_unlocked() && le32_to_cpu(header->magic_number) == + if (is_userdebug() && le32_to_cpu(header->magic_number) == VERITY_METADATA_MAGIC_DISABLE) { retval = VERITY_STATE_DISABLE; return retval; From ad2f6cf0be78107a2bf435b8a21f801658935719 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 6 Jul 2016 17:16:19 -0700 Subject: [PATCH 0197/3220] ANDROID: dm: android-verity: Verify header before fetching table Move header validation logic before reading the verity_table as an invalid header implies the table is invalid as well. 
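A condensed control-flow sketch of the reordered extract_metadata() (not a standalone program; verify_header() and VERITY_STATE_DISABLE are the real symbols from the diff below, while the comments paraphrase the elided steps):

	/* the metadata region and its header are read first ... */
	err = verify_header(header);		/* validate before parsing */
	if (err == VERITY_STATE_DISABLE) {
		*verity_enabled = false;	/* table is still read for sizing */
		err = 0;
	} else if (err)
		goto free_header;		/* invalid header: the table is never parsed */
	/* ... only now is the verity table allocated and copied out */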
(Cherry-picked from: https://partner-android-review.git.corp.google.com/#/c/625203) BUG: 29940612 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ib34d25c0854202f3e70df0a6d0ef1d96f0250c8e --- drivers/md/dm-android-verity.c | 140 +++++++++++++++++---------------- 1 file changed, 71 insertions(+), 69 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 999e75bf2ba0..1f4eb099209d 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -365,12 +365,38 @@ static int find_size(dev_t dev, u64 *device_size) return 0; } -static struct android_metadata *extract_metadata(dev_t dev, - struct fec_header *fec) +static int verify_header(struct android_metadata_header *header) +{ + int retval = -EINVAL; + + if (is_userdebug() && le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE) + return VERITY_STATE_DISABLE; + + if (!(le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_NUMBER) || + (le32_to_cpu(header->magic_number) == + VERITY_METADATA_MAGIC_DISABLE)) { + DMERR("Incorrect magic number"); + return retval; + } + + if (le32_to_cpu(header->protocol_version) != + VERITY_METADATA_VERSION) { + DMERR("Unsupported version %u", + le32_to_cpu(header->protocol_version)); + return retval; + } + + return 0; +} + +static int extract_metadata(dev_t dev, struct fec_header *fec, + struct android_metadata **metadata, + bool *verity_enabled) { struct block_device *bdev; struct android_metadata_header *header; - struct android_metadata *uninitialized_var(metadata); int i; u32 table_length, copy_length, offset; u64 metadata_offset; @@ -381,7 +407,7 @@ static struct android_metadata *extract_metadata(dev_t dev, if (IS_ERR_OR_NULL(bdev)) { DMERR("blkdev_get_by_dev failed"); - return ERR_CAST(bdev); + return -ENODEV; } find_metadata_offset(fec, bdev, &metadata_offset); @@ -399,7 +425,6 @@ static struct android_metadata *extract_metadata(dev_t dev, (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE); if (err) { DMERR("Error while reading verity metadata"); - metadata = ERR_PTR(err); goto blkdev_release; } @@ -418,24 +443,42 @@ static struct android_metadata *extract_metadata(dev_t dev, le32_to_cpu(header->protocol_version), le32_to_cpu(header->table_length)); - metadata = kzalloc(sizeof(*metadata), GFP_KERNEL); - if (!metadata) { + err = verify_header(header); + + if (err == VERITY_STATE_DISABLE) { + DMERR("Mounting root with verity disabled"); + *verity_enabled = false; + /* we would still have to read the metadata to figure out + * the data blocks size. Or may be could map the entire + * partition similar to mounting the device. + * + * Reset error as well as the verity_enabled flag is changed. 
+ */ + err = 0; + } else if (err) + goto free_header; + + *metadata = kzalloc(sizeof(**metadata), GFP_KERNEL); + if (!*metadata) { DMERR("kzalloc for metadata failed"); err = -ENOMEM; goto free_header; } - metadata->header = header; + (*metadata)->header = header; table_length = le32_to_cpu(header->table_length); if (table_length == 0 || table_length > (VERITY_METADATA_SIZE - - sizeof(struct android_metadata_header))) + sizeof(struct android_metadata_header))) { + DMERR("table_length too long"); + err = -EINVAL; goto free_metadata; + } - metadata->verity_table = kzalloc(table_length + 1, GFP_KERNEL); + (*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL); - if (!metadata->verity_table) { + if (!(*metadata)->verity_table) { DMERR("kzalloc verity_table failed"); err = -ENOMEM; goto free_metadata; @@ -443,13 +486,15 @@ static struct android_metadata *extract_metadata(dev_t dev, if (sizeof(struct android_metadata_header) + table_length <= PAGE_SIZE) { - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), table_length); } else { copy_length = PAGE_SIZE - sizeof(struct android_metadata_header); - memcpy(metadata->verity_table, page_address(payload.page_io[0]) + memcpy((*metadata)->verity_table, + page_address(payload.page_io[0]) + sizeof(struct android_metadata_header), copy_length); table_length -= copy_length; @@ -457,13 +502,13 @@ static struct android_metadata *extract_metadata(dev_t dev, i = 1; while (table_length != 0) { if (table_length > PAGE_SIZE) { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), PAGE_SIZE); offset += PAGE_SIZE; table_length -= PAGE_SIZE; } else { - memcpy(metadata->verity_table + offset, + memcpy((*metadata)->verity_table + offset, page_address(payload.page_io[i]), table_length); table_length = 0; @@ -471,25 +516,23 @@ static struct android_metadata *extract_metadata(dev_t dev, i++; } } - metadata->verity_table[table_length] = '\0'; + (*metadata)->verity_table[table_length] = '\0'; + DMINFO("verity_table: %s", (*metadata)->verity_table); goto free_payload; free_metadata: - kfree(metadata); + kfree(*metadata); free_header: kfree(header); - metadata = ERR_PTR(err); free_payload: for (i = 0; i < payload.number_of_pages; i++) if (payload.page_io[i]) __free_page(payload.page_io[i]); kfree(payload.page_io); - - DMINFO("verity_table: %s", metadata->verity_table); blkdev_release: blkdev_put(bdev, FMODE_READ); - return metadata; + return err; } /* helper functions to extract properties from dts */ @@ -522,34 +565,6 @@ static int verity_mode(void) return DM_VERITY_MODE_EIO; } -static int verify_header(struct android_metadata_header *header) -{ - int retval = -EINVAL; - - if (is_userdebug() && le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE) { - retval = VERITY_STATE_DISABLE; - return retval; - } - - if (!(le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_NUMBER) || - (le32_to_cpu(header->magic_number) == - VERITY_METADATA_MAGIC_DISABLE)) { - DMERR("Incorrect magic number"); - return retval; - } - - if (le32_to_cpu(header->protocol_version) != - VERITY_METADATA_VERSION) { - DMERR("Unsupported version %u", - le32_to_cpu(header->protocol_version)); - return retval; - } - - return 0; -} - static int verify_verity_signature(char *key_id, struct android_metadata *metadata) { @@ -649,7 +664,7 @@ static int add_as_linear_device(struct dm_target 
*ti, char *dev) static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { dev_t uninitialized_var(dev); - struct android_metadata *uninitialized_var(metadata); + struct android_metadata *metadata = NULL; int err = 0, i, mode; char *key_id, *table_ptr, dummy, *target_device, *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS]; @@ -717,26 +732,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - metadata = extract_metadata(dev, &fec); + err = extract_metadata(dev, &fec, &metadata, &verity_enabled); - if (IS_ERR(metadata)) { + if (err) { DMERR("Error while extracting metadata"); handle_error(); - return -EINVAL; - } - - err = verify_header(metadata->header); - - if (err == VERITY_STATE_DISABLE) { - DMERR("Mounting root with verity disabled"); - verity_enabled = false; - /* we would still have to parse the args to figure out - * the data blocks size. Or may be could map the entire - * partition similar to mounting the device. - */ - } else if (err) { - DMERR("Verity header handle error"); - handle_error(); goto free_metadata; } @@ -869,8 +869,10 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } free_metadata: - kfree(metadata->header); - kfree(metadata->verity_table); + if (metadata) { + kfree(metadata->header); + kfree(metadata->verity_table); + } kfree(metadata); return err; } From edb97738f20194cb7eea8eeb81bac21f4a8cfb83 Mon Sep 17 00:00:00 2001 From: Jeremy Compostella Date: Tue, 10 May 2016 13:10:20 +0200 Subject: [PATCH 0198/3220] ANDROID: dm verity fec: pack the fec_header structure The fec_header structure is generated at build time and stored on disk. The fec_header might be built on a 64-bit machine while it is read by a 32-bit device, or the other way around. In such situations, the fec_header fields are not aligned as the device expects, and it fails to read the fec_header structure. This patch makes the fec_header packed. Change-Id: Idb84453e70cc11abd5ef3a0adfbb16f8b5feaf06 Signed-off-by: Jeremy Compostella --- drivers/md/dm-android-verity.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index 782e1c815c67..f43b02fbb475 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -72,9 +72,6 @@ * if fec is not present * */ -/* TODO: rearrange structure to reduce memory holes - * depends on userspace change. - */ struct fec_header { __le32 magic; __le32 version; @@ -83,7 +80,7 @@ struct fec_header { __le32 fec_size; __le64 inp_size; u8 hash[SHA256_DIGEST_SIZE]; -}; +} __attribute__((packed)); struct android_metadata_header { __le32 magic_number; From 3a400abdc57e047786d0b35d39617a794e2bb97e Mon Sep 17 00:00:00 2001 From: Joseph Lo Date: Mon, 22 Apr 2013 14:39:18 +0800 Subject: [PATCH 0199/3220] CHROMIUM: sched: update the average of nr_running Doing an exponential moving average per nr_running++/-- does not guarantee a fixed sample rate, which induces errors if there are lots of threads being enqueued/dequeued from the rq (Linpack mt). Instead of keeping track of the avg, the scheduler now keeps track of the integral of nr_running and allows the readers to perform filtering on top. 
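A sketch of the reader-side filtering this enables: the mean number of runnable tasks over any window falls out of two samples of the integral. The helper below is hypothetical; it assumes the FSHIFT fixed-point scaling applied by NR_AVE_SCALE() in the diff and timestamps taken from rq->clock_task:

	#include <linux/math64.h>

	/*
	 * Average nr_running over the window [then, now] (nanoseconds),
	 * recovered from two samples of nr_running_integral().
	 */
	static u64 ave_nr_running(u64 integral_then, u64 integral_now,
				  u64 then, u64 now)
	{
		u64 delta = now - then;

		if (!delta)
			return 0;
		/* undo the << FSHIFT scaling applied by NR_AVE_SCALE() */
		return div64_u64(integral_now - integral_then, delta) >> FSHIFT;
	}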
Original-author: Sai Charan Gurrappadi Change-Id: Id946654f32fa8be0eaf9d8fa7c9a8039b5ef9fab Signed-off-by: Joseph Lo Signed-off-by: Andrew Bresticker Reviewed-on: https://chromium-review.googlesource.com/174694 Reviewed-on: https://chromium-review.googlesource.com/272853 [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 30 ++++++++++++++++++++++++++ kernel/sched/sched.h | 49 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b9b11b2dbabd..ee59abcab30d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -173,6 +173,9 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); +#ifdef CONFIG_CPU_QUIET +extern u64 nr_running_integral(unsigned int cpu); +#endif extern void calc_global_load(unsigned long ticks); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9e25e68747d7..42c45ccc1f62 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2768,6 +2768,36 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +#ifdef CONFIG_CPU_QUIET +u64 nr_running_integral(unsigned int cpu) +{ + unsigned int seqcnt; + u64 integral; + struct rq *q; + + if (cpu >= nr_cpu_ids) + return 0; + + q = cpu_rq(cpu); + + /* + * Update average to avoid reading stalled value if there were + * no run-queue changes for a long time. On the other hand if + * the changes are happening right now, just read current value + * directly. + */ + + seqcnt = read_seqcount_begin(&q->ave_seqcnt); + integral = do_nr_running_integral(q); + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) { + read_seqcount_begin(&q->ave_seqcnt); + integral = q->nr_running_integral; + } + + return integral; +} +#endif + void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3f3db9849c..5713513adb9d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,14 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif + +#ifdef CONFIG_CPU_QUIET + /* time-based average load */ + u64 nr_last_stamp; + u64 nr_running_integral; + seqcount_t ave_seqcnt; +#endif + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -1353,7 +1361,7 @@ extern void init_entity_runnable_average(struct sched_entity *se); extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); -static inline void add_nr_running(struct rq *rq, unsigned count) +static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1381,11 +1389,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } } -static inline void sub_nr_running(struct rq *rq, unsigned count) +static inline void __sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; } +#ifdef CONFIG_CPU_QUIET +#define NR_AVE_SCALE(x) ((x) << FSHIFT) +static inline u64 do_nr_running_integral(struct rq *rq) +{ + s64 nr, deltax; + u64 nr_running_integral = rq->nr_running_integral; + + deltax = rq->clock_task - rq->nr_last_stamp; + nr = NR_AVE_SCALE(rq->nr_running); + + nr_running_integral += nr * deltax; + + return nr_running_integral; +} + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + 
write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __add_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __sub_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} +#else +#define add_nr_running __add_nr_running +#define sub_nr_running __sub_nr_running +#endif + static inline void rq_last_tick_reset(struct rq *rq) { #ifdef CONFIG_NO_HZ_FULL
From 687fa2dcd285d3344fe89cd6a3c992fcaba13b6f Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 17:16:41 +0000 Subject: [PATCH 0200/3220] arm: topology: Define TC2 energy and provide it to the scheduler
This patch is only here to be able to test provisioning of energy related data from an arch topology shim layer to the scheduler. Since there is no code today which deals with extracting energy related data from the dtb or acpi, and processing it in the topology shim layer, the contents of the sched_group_energy structures as well as the idle_state and capacity_state arrays are hard-coded here.
This patch defines the sched_group_energy structure as well as the idle_state and capacity_state array for the cluster (relates to sched groups (sgs) in DIE sched domain level) and for the core (relates to sgs in MC sd level) for a Cortex A7 as well as for a Cortex A15. It further provides related implementations of the sched_domain_energy_f functions (cpu_cluster_energy() and cpu_core_energy()).
To be able to propagate this information from the topology shim layer to the scheduler, the elements of the arm_topology[] table have been provisioned with the appropriate sched_domain_energy_f functions.
Change-Id: I8c014bbd04f6a1d57892be9bfa16affe07948dcf cc: Russell King Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 126 ++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 3 deletions(-)
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 0308342def8c..f5941004efba 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -287,6 +287,127 @@ void store_cpu_topology(unsigned int cpuid) cpu_topology[cpuid].socket_id, mpidr); } +/* + * ARM TC2 specific energy cost model data. There are no unit requirements for + * the data. Data can be normalized to any reference point, but the + * normalization must be consistent. That is, one bogo-joule/watt must be the + * same quantity for all data, but we don't care what it is.
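+ * + * For example, the A7 core entry { .cap = 430, .power = 1024 } and the A15 + * core entry { .cap = 1024, .power = 6997 } below only need to share the + * same bogo-watt scale; their absolute values carry no unit.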
+ */ +static struct idle_state idle_states_cluster_a7[] = { + { .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 25 }, /* WFI */ + { .power = 10 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_cluster_a15[] = { + { .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */ + { .power = 70 }, /* WFI */ + { .power = 25 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_cluster_a7[] = { + /* Cluster only power */ + { .cap = 150, .power = 2967, }, /* 350 MHz */ + { .cap = 172, .power = 2792, }, /* 400 MHz */ + { .cap = 215, .power = 2810, }, /* 500 MHz */ + { .cap = 258, .power = 2815, }, /* 600 MHz */ + { .cap = 301, .power = 2919, }, /* 700 MHz */ + { .cap = 344, .power = 2847, }, /* 800 MHz */ + { .cap = 387, .power = 3917, }, /* 900 MHz */ + { .cap = 430, .power = 4905, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_cluster_a15[] = { + /* Cluster only power */ + { .cap = 426, .power = 7920, }, /* 500 MHz */ + { .cap = 512, .power = 8165, }, /* 600 MHz */ + { .cap = 597, .power = 8172, }, /* 700 MHz */ + { .cap = 682, .power = 8195, }, /* 800 MHz */ + { .cap = 768, .power = 8265, }, /* 900 MHz */ + { .cap = 853, .power = 8446, }, /* 1000 MHz */ + { .cap = 938, .power = 11426, }, /* 1100 MHz */ + { .cap = 1024, .power = 15200, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_cluster_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7), + .idle_states = idle_states_cluster_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7), + .cap_states = cap_states_cluster_a7, +}; + +static struct sched_group_energy energy_cluster_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15), + .idle_states = idle_states_cluster_a15, + .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15), + .cap_states = cap_states_cluster_a15, +}; + +static struct idle_state idle_states_core_a7[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-l */ + }; + +static struct idle_state idle_states_core_a15[] = { + { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */ + { .power = 0 }, /* WFI */ + { .power = 0 }, /* cluster-sleep-b */ + }; + +static struct capacity_state cap_states_core_a7[] = { + /* Power per cpu */ + { .cap = 150, .power = 187, }, /* 350 MHz */ + { .cap = 172, .power = 275, }, /* 400 MHz */ + { .cap = 215, .power = 334, }, /* 500 MHz */ + { .cap = 258, .power = 407, }, /* 600 MHz */ + { .cap = 301, .power = 447, }, /* 700 MHz */ + { .cap = 344, .power = 549, }, /* 800 MHz */ + { .cap = 387, .power = 761, }, /* 900 MHz */ + { .cap = 430, .power = 1024, }, /* 1000 MHz */ + }; + +static struct capacity_state cap_states_core_a15[] = { + /* Power per cpu */ + { .cap = 426, .power = 2021, }, /* 500 MHz */ + { .cap = 512, .power = 2312, }, /* 600 MHz */ + { .cap = 597, .power = 2756, }, /* 700 MHz */ + { .cap = 682, .power = 3125, }, /* 800 MHz */ + { .cap = 768, .power = 3524, }, /* 900 MHz */ + { .cap = 853, .power = 3846, }, /* 1000 MHz */ + { .cap = 938, .power = 5177, }, /* 1100 MHz */ + { .cap = 1024, .power = 6997, }, /* 1200 MHz */ + }; + +static struct sched_group_energy energy_core_a7 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a7), + .idle_states = idle_states_core_a7, + .nr_cap_states = ARRAY_SIZE(cap_states_core_a7), + .cap_states = cap_states_core_a7, +}; + +static struct sched_group_energy energy_core_a15 = { + .nr_idle_states = ARRAY_SIZE(idle_states_core_a15), + .idle_states = idle_states_core_a15, + 
.nr_cap_states = ARRAY_SIZE(cap_states_core_a15), + .cap_states = cap_states_core_a15, +}; + +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_cluster_a7 : + &energy_cluster_a15; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + return cpu_topology[cpu].socket_id ? &energy_core_a7 : + &energy_core_a15; +} + static inline int cpu_corepower_flags(void) { return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ @@ -295,10 +416,9 @@ static inline int cpu_corepower_flags(void) static struct sched_domain_topology_level arm_topology[] = { #ifdef CONFIG_SCHED_MC - { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, { NULL, }, }; From 962b7c10ab3a09cc30114d5ddf01abd9e6979278 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:43:37 +0000 Subject: [PATCH 0201/3220] DEBUG: sched/tune: add tracepoint for task boost signal Change-Id: I545d3bf5569fc41c0fa70f51dff9a19c11d532ee Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 30 ++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 32 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7ec9dcfc701a..606509500446 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -764,6 +764,36 @@ TRACE_EVENT(sched_tune_boostgroup_update, __entry->cpu, __entry->variation, __entry->max_boost) ); +/* + * Tracepoint for accounting task boosted utilization + */ +TRACE_EVENT(sched_boost_task, + + TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + + TP_ARGS(tsk, util, margin), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("comm=%s pid=%d util=%lu margin=%lu", + __entry->comm, __entry->pid, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bc56e0ea1ab..3a1fd4d3a860 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5283,6 +5283,8 @@ boosted_task_util(struct task_struct *task) unsigned long util = task_util(task); unsigned long margin = schedtune_task_margin(task); + trace_sched_boost_task(task, util, margin); + return util + margin; } From 4525aa343e8b252dd365e603ad141639989d547c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:47:21 +0000 Subject: [PATCH 0202/3220] DEBUG: sched/tune: add tracepoint for energy_diff() values Change-Id: Id8fafbd85f6d81248f322e073ee790a7ceec0bf7 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 57 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 606509500446..13fb31cebf62 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -794,6 +794,63 @@ 
TRACE_EVENT(sched_boost_task, __entry->margin) ); +/* + * Tracepoint for accounting sched group energy + */ +TRACE_EVENT(sched_energy_diff, + + TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta, + int nrgb, int nrga, int nrgd, int capb, int capa, int capd, + int nrgn, int nrgp), + + TP_ARGS(tsk, scpu, dcpu, udelta, + nrgb, nrga, nrgd, capb, capa, capd, + nrgn, nrgp), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, scpu ) + __field( int, dcpu ) + __field( int, udelta ) + __field( int, nrgb ) + __field( int, nrga ) + __field( int, nrgd ) + __field( int, capb ) + __field( int, capa ) + __field( int, capd ) + __field( int, nrgn ) + __field( int, nrgp ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->scpu = scpu; + __entry->dcpu = dcpu; + __entry->udelta = udelta; + __entry->nrgb = nrgb; + __entry->nrga = nrga; + __entry->nrgd = nrgd; + __entry->capb = capb; + __entry->capa = capa; + __entry->capd = capd; + __entry->nrgn = nrgn; + __entry->nrgp = nrgp; + ), + + TP_printk("pid=%d comm=%s " + "src_cpu=%d dst_cpu=%d usage_delta=%d " + "nrg_before=%d nrg_after=%d nrg_diff=%d " + "cap_before=%d cap_after=%d cap_delta=%d " + "nrg_delta=%d nrg_payoff=%d", + __entry->pid, __entry->comm, + __entry->scpu, __entry->dcpu, __entry->udelta, + __entry->nrgb, __entry->nrga, __entry->nrgd, + __entry->capb, __entry->capa, __entry->capd, + __entry->nrgn, __entry->nrgp) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a1fd4d3a860..69cb6fb6793e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4998,6 +4998,7 @@ static int energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int result; struct energy_env eenv_before = { .util_delta = 0, @@ -5041,7 +5042,15 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - return energy_diff_evaluate(eenv); + result = energy_diff_evaluate(eenv); + + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + + return result; } /* From ae54c7741f30a963d84134e92fd452b8252946a8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 20 Jan 2016 14:06:05 +0000 Subject: [PATCH 0203/3220] DEBUG: sched/tune: add tracepoint on P-E space filtering Change-Id: I31dfed67c0486713b88efb75df767329f2802e06 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 35 +++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 13fb31cebf62..cb082a68cdd6 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -851,6 +851,41 @@ TRACE_EVENT(sched_energy_diff, __entry->nrgn, __entry->nrgp) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_filter, + + TP_PROTO(int nrg_delta, int cap_delta, + int nrg_gain, int cap_gain, + int payoff, int region), + + TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region), + + TP_STRUCT__entry( + __field( int, nrg_delta ) + __field( int, cap_delta ) + __field( int, nrg_gain ) + __field( 
int, cap_gain ) + __field( int, payoff ) + __field( int, region ) + ), + + TP_fast_assign( + __entry->nrg_delta = nrg_delta; + __entry->cap_delta = cap_delta; + __entry->nrg_gain = nrg_gain; + __entry->cap_gain = cap_gain; + __entry->payoff = payoff; + __entry->region = region; + ), + + TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d", + __entry->nrg_delta, __entry->cap_delta, + __entry->nrg_gain, __entry->cap_gain, + __entry->payoff, __entry->region) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 7a434f2394e7..b40d40dc3c49 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -70,6 +70,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, */ payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + + trace_sched_tune_filter( + nrg_delta, cap_delta, + threshold_gains[perf_boost_idx].nrg_gain, + threshold_gains[perf_boost_idx].cap_gain, + payoff, 8); + return payoff; } @@ -84,6 +91,13 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, */ payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + + trace_sched_tune_filter( + nrg_delta, cap_delta, + threshold_gains[perf_constrain_idx].nrg_gain, + threshold_gains[perf_constrain_idx].cap_gain, + payoff, 6); + return payoff; } @@ -155,12 +169,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_constrain_idx; /* Optimal (O) region */ - if (nrg_delta < 0 && cap_delta > 0) + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; + } /* Suboptimal (S) region */ - if (nrg_delta > 0 && cap_delta < 0) + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; + } /* Get task specific perf Boost/Constraints indexes */ rcu_read_lock(); @@ -531,12 +549,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, struct task_struct *task) { /* Optimal (O) region */ - if (nrg_delta < 0 && cap_delta > 0) + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); return INT_MAX; + } /* Suboptimal (S) region */ - if (nrg_delta > 0 && cap_delta < 0) + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); return -INT_MAX; + } return __schedtune_accept_deltas(nrg_delta, cap_delta, perf_boost_idx, perf_constrain_idx);
From 765c2ab363a02a71f9ef4c55a7f598e3dd401e0b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 22 Jul 2016 11:35:59 +0100 Subject: [PATCH 0204/3220] FIXUP: sched: fix build for non-SMP target
Currently the build for a single-core (e.g. user-mode) Linux is broken and this configuration is required (at least) to run some network tests. The main issues with the current code on single-core systems are: 1. {se,rq}::sched_avg is not available nor maintained for !SMP systems This means that load and utilisation signals are NOT available in single core systems. All the EAS code depends on these signals. 2. sched_group_energy is also SMP dependent. Again this means that all the EAS setup and preparation code (energy model initialization) has to be properly guarded/disabled for !SMP systems. 3. SchedFreq depends on the utilization signal, which is not available on !SMP systems. 4.
SchedTune is useless on unicore systems if SchedFreq is not available. 5. WALT machinery is not required on single-core systems. This patch addresses all these issues by enforcing some constraints for single-core systems: a) WALT, SchedTune and SchedFreq are now dependent on SMP b) The default governor for !SMP systems is INTERACTIVE c) The energy model initialisation/build functions are built only for SMP systems d) Other minor code re-arrangements and CONFIG_SMP guarding to enable single core builds.
Signed-off-by: Patrick Bellasi --- drivers/cpufreq/Kconfig | 1 + include/linux/sched_energy.h | 8 ++++++++ include/trace/events/sched.h | 4 ++++ init/Kconfig | 1 + kernel/sched/Makefile | 4 ++-- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++---- kernel/sched/sched.h | 3 +-- 7 files changed, 46 insertions(+), 8 deletions(-)
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index af523c539af0..a2d63d86bb70 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -195,6 +195,7 @@ config CPU_FREQ_GOV_CONSERVATIVE config CPU_FREQ_GOV_SCHED bool "'sched' cpufreq governor" depends on CPU_FREQ + depends on SMP select CPU_FREQ_GOV_COMMON help 'sched' - this governor scales cpu frequency from the
diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h index a3f1627ac609..1daf3e1f98a7 100644 --- a/include/linux/sched_energy.h +++ b/include/linux/sched_energy.h @@ -29,8 +29,16 @@ #define for_each_possible_sd_level(level) \ for (level = 0; level < NR_SD_LEVELS; level++) +#ifdef CONFIG_SMP + extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; void init_sched_energy_costs(void); +#else + +#define init_sched_energy_costs() do { } while (0) + +#endif /* CONFIG_SMP */ + #endif
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cb082a68cdd6..07ed295d0f23 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -587,6 +587,8 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu_scale_factor) ); +#ifdef CONFIG_SMP + /* * Tracepoint for accounting sched averages for tasks. */ @@ -886,6 +888,8 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#endif /* CONFIG_SMP */ + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */
diff --git a/init/Kconfig b/init/Kconfig index 5d9097e2b805..facb2b587d5b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1256,6 +1256,7 @@ config SCHED_AUTOGROUP config SCHED_TUNE bool "Boosting for CFS tasks (EXPERIMENTAL)" + depends on SMP help This option enables the system-wide support for task boosting.
When this support is enabled a new sysctl interface is exposed to diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c6a85f813dfd..174762d8695b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,9 +12,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69cb6fb6793e..1b12b14756ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4150,8 +4150,14 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static bool cpu_overutilized(int cpu); static inline unsigned long boosted_cpu_util(int cpu); +#else +#define boosted_cpu_util(cpu) cpu_util(cpu) +#endif +#ifdef CONFIG_SMP static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4164,8 +4170,7 @@ static void update_capacity_of(int cpu) req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } - -static bool cpu_overutilized(int cpu); +#endif /* * The enqueue_task method is called before nr_running is @@ -4177,8 +4182,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; int task_wakeup = flags & ENQUEUE_WAKEUP; +#endif for_each_sched_entity(se) { if (se->on_rq) @@ -4210,8 +4217,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) add_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4228,6 +4239,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -4285,8 +4298,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) { + if (!se) sub_nr_running(rq, 1); + +#ifdef CONFIG_SMP + + if (!se) { schedtune_dequeue_task(p, cpu_of(rq)); /* @@ -4304,6 +4321,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) set_cfs_cpu_capacity(cpu_of(rq), false, 0); } } + +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -5692,6 +5712,8 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } +#else +#define task_fits_max(p, cpu) true #endif /* CONFIG_SMP */ static unsigned long @@ -8716,10 +8738,13 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); +#ifdef CONFIG_SMP if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; rq->misfit_task = !task_fits_max(curr, rq->cpu); +#endif + } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5713513adb9d..a06c19c48057 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1277,6 +1277,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP +extern void 
init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); @@ -1359,8 +1360,6 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); - static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; From d753e92e19660a6adcc3b8e0076a6c6078e5f59d Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 1 Aug 2016 11:34:05 +0100 Subject: [PATCH 0205/3220] sched/cpufreq_sched: Consolidated update Contains: sched/cpufreq_sched: use shorter throttle for raising OPP Avoid cases where a brief drop in load causes a change to a low OPP for the full throttle period. Use a shorter throttle period for raising OPP than for lowering OPP. sched-freq: Fix handling of max/min frequency This reverts commit 9726142608f5b3bf5df4280243c9d324e692a510. Change-Id: Ia78095354f7ad9492f00deb509a2b45112361eda sched/cpufreq: Increasing throttle_down_nsec to 50ms Change-Id: I2d8969cf2a64fa719b9dd86f43f9dd14b1ff84fe sched-freq: make throttle times tunable Change-Id: I127879645367425b273441d7f0306bb15d5633cb Signed-off-by: Srinath Sridharan Signed-off-by: Todd Kjos Signed-off-by: Juri Lelli [jstultz: Fwdported to 4.4] Signed-off-by: John Stultz --- drivers/cpufreq/Kconfig | 2 +- kernel/sched/cpufreq_sched.c | 175 +++++++++++++++++++++++++++++++---- 2 files changed, 160 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a2d63d86bb70..d423f51d3276 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -105,7 +105,7 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE config CPU_FREQ_DEFAULT_GOV_SCHED bool "sched" - select CPU_FREQ_GOV_SCHED + select CPU_FREQ_GOV_INTERACTIVE help Use the CPUfreq governor 'sched' as default. This scales cpu frequency using CPU utilization estimates from the diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index e1d208e101ed..3f8c67a3ea0f 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -19,7 +19,8 @@ #include "sched.h" -#define THROTTLE_NSEC 50000000 /* 50ms default */ +#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */ +#define THROTTLE_UP_NSEC 500000 /* 500us default */ struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; static bool __read_mostly cpufreq_driver_slow; @@ -33,8 +34,10 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); /** * gov_data - per-policy data internal to the governor - * @throttle: next throttling period expiry. Derived from throttle_nsec - * @throttle_nsec: throttle period length in nanoseconds + * @up_throttle: next throttling period expiry if increasing OPP + * @down_throttle: next throttling period expiry if decreasing OPP + * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP + * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP * @task: worker thread for dvfs transition that may block/sleep * @irq_work: callback used to wake up worker thread * @requested_freq: last frequency requested by the sched governor @@ -48,11 +51,14 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); * call down_write(policy->rwsem). 
*/ struct gov_data { - ktime_t throttle; - unsigned int throttle_nsec; + ktime_t up_throttle; + ktime_t down_throttle; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; + int max; }; static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, @@ -66,25 +72,29 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); up_write(&policy->rwsem); } -static bool finish_last_request(struct gov_data *gd) +static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq) { ktime_t now = ktime_get(); - if (ktime_after(now, gd->throttle)) + ktime_t throttle = gd->requested_freq < cur_freq ? + gd->down_throttle : gd->up_throttle; + + if (ktime_after(now, throttle)) return false; while (1) { - int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + int usec_left = ktime_to_ns(ktime_sub(throttle, now)); usec_left /= NSEC_PER_USEC; trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); - if (ktime_after(now, gd->throttle)) + if (ktime_after(now, throttle)) return true; } } @@ -128,7 +138,7 @@ static int cpufreq_sched_thread(void *data) * if the frequency thread sleeps while waiting to be * unthrottled, start over to check for a newer request */ - if (finish_last_request(gd)) + if (finish_last_request(gd, policy->cur)) continue; last_request = new_request; cpufreq_sched_try_driver_target(policy, new_request); @@ -183,16 +193,21 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) goto out; freq_new = policy->freq_table[index_new].frequency; + if (freq_new > policy->max) + freq_new = policy->max; + + if (freq_new < policy->min) + freq_new = policy->min; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, gd->requested_freq); - if (freq_new == gd->requested_freq) goto out; @@ -246,10 +261,17 @@ static inline void clear_sched_freq(void) static_key_slow_dec(&__sched_freq); } +static struct attribute_group sched_attr_group_gov_pol; +static struct attribute_group *get_sysfs_attr(void) +{ + return &sched_attr_group_gov_pol; +} + static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) { struct gov_data *gd; int cpu; + int rc; for_each_cpu(cpu, policy->cpus) memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, @@ -259,11 +281,20 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (!gd) return -ENOMEM; - gd->throttle_nsec = policy->cpuinfo.transition_latency ? + gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? 
policy->cpuinfo.transition_latency : - THROTTLE_NSEC; + THROTTLE_UP_NSEC; + gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->throttle_nsec); + __func__, gd->up_throttle_nsec); + + gd->max = policy->max; + + rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + if (rc) { + pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); + goto err; + } if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; @@ -301,6 +332,8 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } + sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + policy->governor_data = NULL; kfree(gd); @@ -317,6 +350,32 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy) return 0; } +static void cpufreq_sched_limits(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", + policy->cpu, policy->min, policy->max, + policy->cur); + + if (!down_write_trylock(&policy->rwsem)) + return; + /* + * Need to keep track of highest max frequency for + * capacity calculations + */ + gd = policy->governor_data; + if (gd->max < policy->max) + gd->max = policy->max; + + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + + up_write(&policy->rwsem); +} + static int cpufreq_sched_stop(struct cpufreq_policy *policy) { int cpu; @@ -340,11 +399,95 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy, case CPUFREQ_GOV_STOP: return cpufreq_sched_stop(policy); case CPUFREQ_GOV_LIMITS: + cpufreq_sched_limits(policy); break; } return 0; } +/* Tunables */ +static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->up_throttle_nsec); +} + +static ssize_t store_up_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->up_throttle_nsec = val; + return count; +} + +static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->down_throttle_nsec); +} + +static ssize_t store_down_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->down_throttle_nsec = val; + return count; +} + +/* + * Create show/store routines + * - sys: One governor instance for complete SYSTEM + * - pol: One governor instance per struct cpufreq_policy + */ +#define show_gov_pol_sys(file_name) \ +static ssize_t show_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + return show_##file_name(policy->governor_data, buf); \ +} + +#define store_gov_pol_sys(file_name) \ +static ssize_t store_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, const char *buf, size_t count) \ +{ \ + return store_##file_name(policy->governor_data, buf, count); \ +} + +#define gov_pol_attr_rw(_name) \ + static struct freq_attr _name##_gov_pol = \ + __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) + +#define show_store_gov_pol_sys(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name) +#define tunable_handlers(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name); \ + 
gov_pol_attr_rw(file_name) + +tunable_handlers(down_throttle_nsec); +tunable_handlers(up_throttle_nsec); + +/* Per policy governor instance */ +static struct attribute *sched_attributes_gov_pol[] = { + &up_throttle_nsec_gov_pol.attr, + &down_throttle_nsec_gov_pol.attr, + NULL, +}; + +static struct attribute_group sched_attr_group_gov_pol = { + .attrs = sched_attributes_gov_pol, + .name = "sched", +}; + #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED static #endif From 2e9abbc942e6211f9ac4dadb14debc39bb9d1418 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 14 Jul 2016 09:57:29 +0100 Subject: [PATCH 0206/3220] sched: EAS: take cstate into account when selecting idle core Introduce a new sysctl for this option, 'sched_cstate_aware'. When this is enabled, select_idle_sibling in CFS is modified to choose the idle CPU in the sibling group which has the lowest idle state index - idle state indexes are assumed to increase as sleep depth and hence wakeup latency increase. In this way, we attempt to minimise wakeup latency when an idle CPU is required. Signed-off-by: Srinath Sridharan Includes: sched: EAS: fix select_idle_sibling when sysctl_sched_cstate_aware is enabled, best_idle cpu will not be chosen in the original flow because it will goto done directly Bug: 30107557 Change-Id: Ie09c2e3960cafbb976f8d472747faefab3b4d6ac Signed-off-by: martin_liu --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 55 +++++++++++++++++++++++++++--------- kernel/sysctl.c | 7 +++++ 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4479e48c7712..7d021393b0da 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b12b14756ec..14c8527d7bcb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,6 +51,7 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_cstate_aware = 1; /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5468,15 +5469,20 @@ static int select_idle_sibling(struct task_struct *p, int target) struct sched_domain *sd; struct sched_group *sg; int i = task_cpu(p); + int best_idle = -1; + int best_idle_cstate = -1; + int best_idle_capacity = INT_MAX; - if (idle_cpu(target)) - return target; + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) + return target; - /* - * If the prevous cpu is cache affine and idle, don't be stupid. - */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + /* + * If the prevous cpu is cache affine and idle, don't be stupid. + */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; + } /* * Otherwise, iterate the domains and find an elegible idle cpu. 
@@ -5489,18 +5495,41 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; - target = cpumask_first_and(sched_group_cpus(sg), + if (i == target && new_usage <= capacity_curr_of(target)) + return target; + + if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { + best_idle = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); - goto done; + goto done; + } next: sg = sg->next; } while (sg != sd->groups); } + if (best_idle > 0) + target = best_idle; + done: return target; }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98c0f6ef01f1..cc0bf3ecdbda 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,13 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_cstate_aware", + .data = &sysctl_sched_cstate_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity,
From 4a5e890ec60d2e341fa560d8149997f5f0c48d67 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 29 Jul 2016 14:04:11 +0100 Subject: [PATCH 0207/3220] sched/fair: add tunable to force selection at cpu granularity
EAS assumes that clusters with smaller capacity cores are more energy-efficient. This may not be true on non-big-little devices, so EAS can make incorrect cluster selections when finding a CPU to wake. The "sched_is_big_little" hint can be used to cause a cpu-based selection instead of cluster-based selection. This change also incorporates the sync hint enable patch: EAS did not honour synchronous wakeup hints, so a new sysctl is created to ask EAS to use this information when selecting a CPU. The control is called "sched_sync_hint_enable".
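A minimal usage sketch (illustrative only; both controls are plain proc_dointvec entries in kern_table, so they should surface under /proc/sys/kernel/):

  # non-big-little device: pick CPUs at cpu rather than cluster granularity
  echo 0 > /proc/sys/kernel/sched_is_big_little
  # let EAS honour synchronous wakeup hints
  echo 1 > /proc/sys/kernel/sched_sync_hint_enable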
Also contains: EAS: sched/fair: for SMP bias toward idle core with capacity For SMP devices, on wakeup bias towards idle cores that have capacity vs busy devices that need a higher OPP eas: favor idle cpus for boosted tasks BUG: 29533997 BUG: 29512132 Change-Id: I0cc9a1b1b88fb52916f18bf2d25715bdc3634f9c Signed-off-by: Juri Lelli Signed-off-by: Srinath Sridharan eas/sched/fair: Favoring busy cpus with low OPPs BUG: 29533997 BUG: 29512132 Change-Id: I9305b3239698d64278db715a2e277ea0bb4ece79 Signed-off-by: Juri Lelli --- include/linux/sched/sysctl.h | 2 + kernel/sched/fair.c | 191 +++++++++++++++++++++++++++-------- kernel/sysctl.c | 14 +++ 3 files changed, 167 insertions(+), 40 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7d021393b0da..4883dcf3e1a9 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,6 +39,8 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_is_big_little; +extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14c8527d7bcb..a22df661aaa6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,7 +51,10 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_is_big_little = 0; +unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; + /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5534,7 +5537,97 @@ done: return target; } -static int energy_aware_wake_cpu(struct task_struct *p, int target) +static inline int find_best_target(struct task_struct *p) +{ + int i, boosted; + int target_cpu = -1; + int target_capacity = 0; + int backup_capacity = 0; + int idle_cpu = -1; + int best_idle_cstate = INT_MAX; + int backup_cpu = -1; + unsigned long task_util_boosted, new_util; + + /* + * Favor 1) busy cpu with most capacity at current OPP + * 2) idle_cpu with capacity at current OPP + * 3) busy cpu with capacity at higher OPP + */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = schedtune_task_boost(p); +#else + boosted = 0; +#endif + task_util_boosted = boosted_task_util(p); + for_each_cpu(i, tsk_cpus_allowed(p)) { + int cur_capacity = capacity_curr_of(i); + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util_boosted; + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + + if (new_util > capacity_orig_of(i)) + continue; + + /* + * For boosted tasks we favor idle cpus unconditionally to + * improve latency. 
+ */ + if (idle_idx >= 0 && boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + continue; + } + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + if (target_capacity == 0 || + target_capacity > cur_capacity) { + /* busy CPU with most capacity at current OPP */ + target_cpu = i; + target_capacity = cur_capacity; + } + } else if (!boosted) { + if (idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + idle_cpu = i; + } + } + } else if (backup_capacity == 0 || + backup_capacity > cur_capacity) { + /* first busy CPU with capacity at higher OPP */ + backup_capacity = cur_capacity; + backup_cpu = i; + } + } + + if (!boosted && target_cpu < 0) { + target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; + } + + if (boosted && idle_cpu >= 0) + target_cpu = idle_cpu; + return target_cpu; +} + +static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) { struct sched_domain *sd; struct sched_group *sg, *sg_target; @@ -5542,6 +5635,14 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) int target_cpu = task_cpu(p); int i; + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); if (!sd) @@ -5550,50 +5651,60 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) sg = sd->groups; sg_target = sg; - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); + if (sysctl_sched_is_big_little) { /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). + * load_balance() should sort it out later as we get above the tipping + * point. */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. 
+ */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + boosted_task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + } else { /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. + * Find a cpu with sufficient capacity */ - int new_util = cpu_util(i) + boosted_task_util(p); - - if (new_util > capacity_orig_of(i)) - continue; - - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; - } - - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; + int tmp_target = find_best_target(p); + if (tmp_target >= 0) + target_cpu = tmp_target; } if (target_cpu != task_cpu(p)) { @@ -5670,7 +5781,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu); + new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc0bf3ecdbda..4671761ae3c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,20 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_is_big_little", + .data = &sysctl_sched_is_big_little, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware,
From c50cc2299cb1a9ae769c955f5dd09a9648b1600e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 11 Mar 2016 16:44:16 -0800 Subject: [PATCH 0208/3220] sched/fair: add tunable to set initial task load
The choice of initial task load upon fork has a large influence on CPU and OPP selection when scheduler-driven DVFS is in use. Make this tuneable by adding a new sysctl "sched_initial_task_util". If the sched governor is not used, the default remains at SCHED_LOAD_SCALE. Otherwise, the value from the sysctl is used. This defaults to 0.
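A minimal usage sketch (illustrative only; the control is a plain proc_dointvec entry in kern_table, so it should surface under /proc/sys/kernel/, and it is only consulted when the sched governor is active):

  # start newly forked tasks at half capacity instead of the default 0
  # (assuming SCHED_LOAD_SCALE = 1024)
  echo 512 > /proc/sys/kernel/sched_initial_task_util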
Signed-off-by: "Todd Kjos " --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 5 ++++- kernel/sysctl.c | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4883dcf3e1a9..2834841c507e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; +extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a22df661aaa6..88aca75a6659 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -53,6 +53,7 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; +unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; /* @@ -687,7 +688,9 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_avg = sched_freq() ? + sysctl_sched_initial_task_util : + scale_load_down(SCHED_LOAD_SCALE); sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4671761ae3c5..4457e107ec20 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -317,6 +317,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_initial_task_util", + .data = &sysctl_sched_initial_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, From 28e8cb961c2f71a89e391335a9304c3dd8b38d8f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 16:39:27 +0100 Subject: [PATCH 0209/3220] FIX: sched/tune: update usage of boosted task utilisation on CPU selection A boosted task needs to be scheduled on a CPU which can grant a minimum capacity which is higher than its utilization. However, a task can be allocated on a CPU which already provides an utilization which is higher than the task boosted utilization itself. Moreover, with the previous approach a task 100% boosted is not fitting any CPU. This patch makes use of the boosted task utilization just as a threashold which defines the minimum capacity should be available on a CPU to host that task. 
Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 88aca75a6659..9e1935b6e340 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5636,6 +5636,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) struct sched_group *sg, *sg_target; int target_max_cap = INT_MAX; int target_cpu = task_cpu(p); + unsigned long task_util_boosted, new_util; int i; if (sysctl_sched_sync_hint_enable && sync) { @@ -5679,6 +5680,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) } } while (sg = sg->next, sg != sd->groups); + task_util_boosted = boosted_task_util(p); /* Find cpu with sufficient capacity */ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { /* @@ -5686,8 +5688,13 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - int new_util = cpu_util(i) + boosted_task_util(p); + new_util = cpu_util(i) + task_util_boosted; + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ if (new_util > capacity_orig_of(i)) continue;
From 6ba071d89dd72b080b9f0e4abf587cad99d5320b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:45:57 +0100 Subject: [PATCH 0210/3220] FIX: sched/tune: move schedtune_normalize_energy into fair.c
The energy normalization function is required to get the proper values for the P-E space filtering function to work. That normalization is part of the hot wakeup path and is currently implemented with a function call. Moving the normalization function into fair.c allows the compiler to further optimize that code by reducing overheads in the wakeup hot path.
Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/fair.c | 121 ++++++++++++++++++++++++++++---------------- kernel/sched/tune.c | 42 +--------------- kernel/sched/tune.h | 12 ++++- 3 files changed, 91 insertions(+), 84 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e1935b6e340..2b23dfefe6f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4975,44 +4975,6 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } -#ifdef CONFIG_SCHED_TUNE -static int energy_diff_evaluate(struct energy_env *eenv) -{ - unsigned int boost; - int nrg_delta; - - /* Return energy diff when boost margin is 0 */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(eenv->task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif - if (boost == 0) - return eenv->nrg.diff; - - /* Compute normalized energy diff */ - nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); - eenv->nrg.delta = nrg_delta; - - eenv->payoff = schedtune_accept_deltas( - eenv->nrg.delta, - eenv->cap.delta, - eenv->task); - - /* - * When SchedTune is enabled, the energy_diff() function will return - * the computed energy payoff value.
Since the energy_diff() return - * value is expected to be negative by its callers, this evaluation - * function return a negative value each time the evaluation return a - * positive payoff, which is the condition for the acceptance of - * a scheduling decision - */ - return -eenv->payoff; -} -#else /* CONFIG_SCHED_TUNE */ -#define energy_diff_evaluate(eenv) eenv->nrg.diff -#endif - /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -5020,12 +4982,11 @@ static int energy_diff_evaluate(struct energy_env *eenv) * utilization is removed from or added to the system (e.g. task wake-up). If * both are specified, the utilization is migrated. */ -static int energy_diff(struct energy_env *eenv) +static inline int __energy_diff(struct energy_env *eenv) { struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; - int result; struct energy_env eenv_before = { .util_delta = 0, @@ -5069,17 +5030,91 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - result = energy_diff_evaluate(eenv); - trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); - return result; + return eenv->nrg.diff; } +#ifdef CONFIG_SCHED_TUNE + +struct target_nrg schedtune_target_nrg; + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +static inline int +normalize_energy(int energy_diff) +{ + u32 normalized_nrg; +#ifdef CONFIG_SCHED_DEBUG + int max_delta; + + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; +} + +static inline int +energy_diff(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Conpute "absolute" energy diff */ + __energy_diff(eenv); + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. 
Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff(eenv) __energy_diff(eenv) +#endif + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b40d40dc3c49..8ca8db2de818 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -3,24 +3,17 @@ #include #include #include -#include #include #include #include #include "sched.h" +#include "tune.h" unsigned int sysctl_sched_cfs_boost __read_mostly; -/* - * System energy normalization constants - */ -static struct target_nrg { - unsigned long min_power; - unsigned long max_power; - struct reciprocal_value rdiv; -} schedtune_target_nrg; +extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ static int perf_boost_idx; @@ -587,37 +580,6 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, return 0; } -/* - * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], - * corresponding to the specified energy variation. - */ -int -schedtune_normalize_energy(int energy_diff) -{ - u32 normalized_nrg; - int max_delta; - -#ifdef CONFIG_SCHED_DEBUG - /* Check for boundaries */ - max_delta = schedtune_target_nrg.max_power; - max_delta -= schedtune_target_nrg.min_power; - WARN_ON(abs(energy_diff) >= max_delta); -#endif - - /* Do scaling using positive numbers to increase the range */ - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; - - /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; - - /* Normalize on max energy for target platform */ - normalized_nrg = reciprocal_divide( - normalized_nrg, schedtune_target_nrg.rdiv); - - return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; -} - #ifdef CONFIG_SCHED_DEBUG static void schedtune_test_nrg(unsigned long delta_pwr) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index f7273a5d994a..7d2aa7951554 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -1,6 +1,17 @@ #ifdef CONFIG_SCHED_TUNE +#include + +/* + * System energy normalization constants + */ +struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +}; + #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); @@ -25,7 +36,6 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) -#define schedtune_normalize_energy(energy) energy #define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta #endif /* CONFIG_SCHED_TUNE */ From 00aae8d5d5cd6f28d7603e0c1c4ac5cf91cb4aa3 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Thu, 28 Jul 2016 17:28:55 +0100 Subject: [PATCH 0211/3220] sched/tune: Add support for negative boost values Change-Id: I164ee04ba98c3a776605f18cb65ee61b3e917939 Contains also: eas/stune: schedtune cpu boost_max must be non-negative. This is to avoid under-accounting cpu capacity which may cause task stacking and frequency spikes. 
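[editor's note: to make the negative-boost arithmetic above concrete, here is a minimal standalone C sketch of the signal proportional compensation (SPC) rule implemented by schedtune_margin(). It is illustrative only, not part of the patch: SCHED_LOAD_SCALE is assumed to be 1024 and spc_margin() is a hypothetical name. A positive boost adds a fraction of the idle headroom (SCHED_LOAD_SCALE - S); a negative boost removes the same fraction of the signal itself.]

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024LL

static long long spc_margin(unsigned long signal, long boost)
{
	long long margin;

	if (boost >= 0)
		margin = (SCHED_LOAD_SCALE - (long long)signal) * boost;
	else
		margin = -((long long)signal) * boost;	/* positive magnitude */

	/* Fast integer division by 100, as in the patch: x * 1311 >> 17 */
	margin *= 1311;
	margin >>= 17;

	return boost < 0 ? -margin : margin;
}

int main(void)
{
	/* With util 512 out of 1024: +50% boost adds 256, -50% removes 256 */
	printf("%lld\n", spc_margin(512, 50));	/* 256 */
	printf("%lld\n", spc_margin(512, -50));	/* -256 */
	return 0;
}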
Change-Id: Ie1c1cbd52a6edb77b4c15a830030aa748dff6f29 --- include/trace/events/sched.h | 20 +++++++++---------- kernel/sched/fair.c | 37 ++++++++++++++++++++---------------- kernel/sched/tune.c | 25 +++++++++++++++--------- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 07ed295d0f23..7dc39dd384de 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -682,14 +682,14 @@ TRACE_EVENT(sched_tune_config, */ TRACE_EVENT(sched_boost_cpu, - TP_PROTO(int cpu, unsigned long util, unsigned long margin), + TP_PROTO(int cpu, unsigned long util, long margin), TP_ARGS(cpu, util, margin), TP_STRUCT__entry( __field( int, cpu ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field(long, margin ) ), TP_fast_assign( @@ -698,7 +698,7 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin = margin; ), - TP_printk("cpu=%d util=%lu margin=%lu", + TP_printk("cpu=%d util=%lu margin=%ld", __entry->cpu, __entry->util, __entry->margin) @@ -710,7 +710,7 @@ TRACE_EVENT(sched_boost_cpu, TRACE_EVENT(sched_tune_tasks_update, TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, - unsigned int boost, unsigned int max_boost), + int boost, int max_boost), TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), @@ -720,8 +720,8 @@ TRACE_EVENT(sched_tune_tasks_update, __field( int, cpu ) __field( int, tasks ) __field( int, idx ) - __field( unsigned int, boost ) - __field( unsigned int, max_boost ) + __field( int, boost ) + __field( int, max_boost ) ), TP_fast_assign( @@ -735,7 +735,7 @@ TRACE_EVENT(sched_tune_tasks_update, ), TP_printk("pid=%d comm=%s " - "cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d", __entry->pid, __entry->comm, __entry->cpu, __entry->tasks, __entry->idx, __entry->boost, __entry->max_boost) @@ -771,7 +771,7 @@ TRACE_EVENT(sched_tune_boostgroup_update, */ TRACE_EVENT(sched_boost_task, - TP_PROTO(struct task_struct *tsk, unsigned long util, unsigned long margin), + TP_PROTO(struct task_struct *tsk, unsigned long util, long margin), TP_ARGS(tsk, util, margin), @@ -779,7 +779,7 @@ TRACE_EVENT(sched_boost_task, __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( unsigned long, util ) - __field( unsigned long, margin ) + __field( long, margin ) ), @@ -790,7 +790,7 @@ TRACE_EVENT(sched_boost_task, __entry->margin = margin; ), - TP_printk("comm=%s pid=%d util=%lu margin=%lu", + TP_printk("comm=%s pid=%d util=%lu margin=%ld", __entry->comm, __entry->pid, __entry->util, __entry->margin) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b23dfefe6f1..aef41d9acd83 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5250,22 +5250,25 @@ static bool cpu_overutilized(int cpu) #ifdef CONFIG_SCHED_TUNE -static unsigned long -schedtune_margin(unsigned long signal, unsigned long boost) +static long +schedtune_margin(unsigned long signal, long boost) { - unsigned long long margin = 0; + long long margin = 0; /* * Signal proportional compensation (SPC) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S) + * M = B * (SCHED_LOAD_SCALE - S), if B is positive + * M = B * S, if B is negative * The obtained M could be used by the caller to "boost" S. 
*/ - margin = SCHED_LOAD_SCALE - signal; - margin *= boost; - + if (boost >= 0) { + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + } else + margin = -signal * boost; /* * Fast integer division by constant: * Constant : (C) = 100 @@ -5281,13 +5284,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) margin *= 1311; margin >>= 17; + if (boost < 0) + margin *= -1; return margin; } -static inline unsigned int +static inline int schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost; + int boost; #ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_cpu_boost(cpu); @@ -5300,12 +5305,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } -static inline unsigned long +static inline long schedtune_task_margin(struct task_struct *task) { - unsigned int boost; + int boost; unsigned long util; - unsigned long margin; + long margin; #ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_task_boost(task); @@ -5323,13 +5328,13 @@ schedtune_task_margin(struct task_struct *task) #else /* CONFIG_SCHED_TUNE */ -static inline unsigned int +static inline int schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } -static inline unsigned int +static inline int schedtune_task_margin(struct task_struct *task) { return 0; @@ -5341,7 +5346,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util, cpu); + long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); @@ -5352,7 +5357,7 @@ static inline unsigned long boosted_task_util(struct task_struct *task) { unsigned long util = task_util(task); - unsigned long margin = schedtune_task_margin(task); + long margin = schedtune_task_margin(task); trace_sched_boost_task(task, util, margin); diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 8ca8db2de818..afc4a7747161 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -213,10 +213,11 @@ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { */ struct boost_groups { /* Maximum boost value for all RUNNABLE tasks on a CPU */ - unsigned boost_max; + bool idle; + int boost_max; struct { /* The boost for tasks on that boost group */ - unsigned boost; + int boost; /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; @@ -229,7 +230,7 @@ static void schedtune_cpu_update(int cpu) { struct boost_groups *bg; - unsigned boost_max; + int boost_max; int idx; bg = &per_cpu(cpu_boost_groups, cpu); @@ -243,9 +244,13 @@ schedtune_cpu_update(int cpu) */ if (bg->group[idx].tasks == 0) continue; + boost_max = max(boost_max, bg->group[idx].boost); } - + /* Ensures boost_max is non-negative when all cgroup boost values + * are neagtive. 
Avoids under-accounting of cpu capacity which may cause + * task stacking and frequency spikes.*/ + boost_max = max(boost_max, 0); bg->boost_max = boost_max; } @@ -391,7 +396,7 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } -static u64 +static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct schedtune *st = css_st(css); @@ -401,11 +406,13 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft) static int boost_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 boost) + s64 boost) { struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; - if (boost < 0 || boost > 100) + if (boost < -100 || boost > 100) return -EINVAL; st->boost = boost; @@ -423,8 +430,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, static struct cftype files[] = { { .name = "boost", - .read_u64 = boost_read, - .write_u64 = boost_write, + .read_s64 = boost_read, + .write_s64 = boost_write, }, { } /* terminate */ }; From 254f5095a8143f0b852c5db7672957cf3b4d65d3 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:38:25 +0100 Subject: [PATCH 0212/3220] FIXUP: sched/tune: fix payoff calculation for boost region The definition of the acceptance regions as well as the translation of these regions into a payoff value was both wrong which turned out in: a) a wrong definition of payoff for the performance boost region b) a correct "by chance" definition of the payoff for the performance constraint region (i.e. two sign errors together fixing the formula) This patch provides a better description of the cut regions as well as a fixed version of the payoff computations, which are now reduced to a single formula usable for both cases. Reported-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: Leo Yan Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 79 +++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index afc4a7747161..6d5fbde9c70e 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -51,50 +51,51 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta, int perf_boost_idx, int perf_constrain_idx) { int payoff = -INT_MAX; + int gain_idx = -1; /* Performance Boost (B) region */ - if (nrg_delta > 0 && cap_delta > 0) { - /* - * Evaluate "Performance Boost" vs "Energy Increase" - * payoff criteria: - * cap_delta / nrg_delta < cap_gain / nrg_gain - * which is: - * nrg_delta * cap_gain > cap_delta * nrg_gain - */ - payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; - payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; - - trace_sched_tune_filter( - nrg_delta, cap_delta, - threshold_gains[perf_boost_idx].nrg_gain, - threshold_gains[perf_boost_idx].cap_gain, - payoff, 8); - - return payoff; - } - + if (nrg_delta >= 0 && cap_delta > 0) + gain_idx = perf_boost_idx; /* Performance Constraint (C) region */ - if (nrg_delta < 0 && cap_delta < 0) { - /* - * Evaluate "Performance Boost" vs "Energy Increase" - * payoff criteria: - * cap_delta / nrg_delta > cap_gain / nrg_gain - * which is: - * cap_delta * nrg_gain > nrg_delta * cap_gain - */ - payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; - payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; - - trace_sched_tune_filter( - nrg_delta, cap_delta, - threshold_gains[perf_constrain_idx].nrg_gain, - threshold_gains[perf_constrain_idx].cap_gain, - payoff, 6); - - return payoff; - } + else if 
(nrg_delta < 0 && cap_delta <= 0) + gain_idx = perf_constrain_idx; /* Default: reject schedule candidate */ + if (gain_idx == -1) + return payoff; + + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * + * - Performance Boost (B) region + * + * Condition: nrg_delta > 0 && cap_delta > 0 + * Payoff criteria: + * cap_gain / nrg_gain < cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since both nrg_gain and nrg_delta are positive, the + * inequality does not change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * - Performance Constraint (C) region + * + * Condition: nrg_delta < 0 && cap_delta < 0 + * payoff criteria: + * cap_gain / nrg_gain > cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since nrg_gain > 0 while nrg_delta < 0, the + * inequality change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * This means that, in case of same positive defined {cap,nrg}_gain + * for both the B and C regions, we can use the same payoff formula + * where a positive value represents the accept condition. + */ + payoff = cap_delta * threshold_gains[gain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain; + return payoff; } From 274bbcfbe44127c5575ed847d48d201a66cc29ce Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 17:42:36 +0100 Subject: [PATCH 0213/3220] sched/{fair,tune}: simplify fair.c code The usage of conditional compiled code is discouraged in fair.c. This patch clean up a bit fair.c by moving schedtune_{cpu.task}_boost definitions into tune.h. Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 21 +++------------------ kernel/sched/tune.h | 6 ++++++ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aef41d9acd83..6390b4cb916c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5077,18 +5077,13 @@ normalize_energy(int energy_diff) static inline int energy_diff(struct energy_env *eenv) { - unsigned int boost; + int boost = schedtune_task_boost(eenv->task); int nrg_delta; /* Conpute "absolute" energy diff */ __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(eenv->task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return eenv->nrg.diff; @@ -5292,13 +5287,8 @@ schedtune_margin(unsigned long signal, long boost) static inline int schedtune_cpu_margin(unsigned long util, int cpu) { - int boost; + int boost = schedtune_cpu_boost(cpu); -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_cpu_boost(cpu); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return 0; @@ -5308,15 +5298,10 @@ schedtune_cpu_margin(unsigned long util, int cpu) static inline long schedtune_task_margin(struct task_struct *task) { - int boost; + int boost = schedtune_task_boost(task); unsigned long util; long margin; -#ifdef CONFIG_CGROUP_SCHEDTUNE - boost = schedtune_task_boost(task); -#else - boost = get_sysctl_sched_cfs_boost(); -#endif if (boost == 0) return 0; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 7d2aa7951554..99637758a8af 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -22,6 +22,9 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #else /* CONFIG_CGROUP_SCHEDTUNE */ +#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() +#define schedtune_task_boost(tsk) 
get_sysctl_sched_cfs_boost() + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -33,6 +36,9 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #else /* CONFIG_SCHED_TUNE */ +#define schedtune_cpu_boost(cpu) 0 +#define schedtune_task_boost(tsk) 0 + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From 7f8f24a0eafe312899818cc07af8e9f1b33d9c39 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:19:41 +0100 Subject: [PATCH 0214/3220] sched/tune: use a single initialisation function With the introduction of initialization function required to compute the energy normalization constants from DTB at boot time, we have now a late_initcall which is already used by SchedTune. This patch consolidate within that function the other initialization bits which was previously deferred to the first CGroup creation. Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/sched/tune.c | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 6d5fbde9c70e..a691b8db2888 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -410,8 +410,6 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost) { struct schedtune *st = css_st(css); - unsigned threshold_idx; - int boost_pct; if (boost < -100 || boost > 100) return -EINVAL; @@ -456,33 +454,14 @@ schedtune_boostgroup_init(struct schedtune *st) return 0; } -static int -schedtune_init(void) -{ - struct boost_groups *bg; - int cpu; - - /* Initialize the per CPU boost groups */ - for_each_possible_cpu(cpu) { - bg = &per_cpu(cpu_boost_groups, cpu); - memset(bg, 0, sizeof(struct boost_groups)); - } - - pr_info(" schedtune configured to support %d boost groups\n", - BOOSTGROUPS_COUNT); - return 0; -} - static struct cgroup_subsys_state * schedtune_css_alloc(struct cgroup_subsys_state *parent_css) { struct schedtune *st; int idx; - if (!parent_css) { - schedtune_init(); + if (!parent_css) return &root_schedtune.css; - } /* Allow only single level hierachies */ if (parent_css != &root_schedtune.css) { @@ -543,6 +522,22 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +static inline void +schedtune_init_cgroups(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info("schedtune: configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); +} + #else /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -690,7 +685,7 @@ schedtune_add_cluster_nrg( * that bind the EM to the topology information. 
*/ static int -schedtune_init_late(void) +schedtune_init(void) { struct target_nrg *ste = &schedtune_target_nrg; unsigned long delta_pwr = 0; @@ -730,10 +725,17 @@ schedtune_init_late(void) ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); schedtune_test_nrg(delta_pwr); + +#ifdef CONFIG_CGROUP_SCHEDTUNE + schedtune_init_cgroups(); +#else + pr_info("schedtune: configured to support global boosting only\n"); +#endif + return 0; nodata: rcu_read_unlock(); return -EINVAL; } -late_initcall(schedtune_init_late); +late_initcall(schedtune_init); From af14760e19ebce67e8913fc0efd32cf839282a95 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 28 Jul 2016 18:44:40 +0100 Subject: [PATCH 0215/3220] FIXUP: sched/tune: fix accounting for runnable tasks Contains: sched/tune: fix accounting for runnable tasks (1/5) The accounting for tasks into boost groups of different CPUs is currently broken mainly because: a) we do not properly track the change of boost group of a RUNNABLE task b) there are race conditions between migration code and accounting code This patch provides fixes to ensure enqueue/dequeue accounting also for throttled tasks. Without this patch it can happen that a task is enqueued into a throttled RQ and is thus not accounted for in the boosting of the corresponding RQ. We could argue that a throttled task should not boost a CPU, however: a) properly implementing CPU boosting while considering throttled tasks would significantly increase the complexity of the solution b) it's not easy to quantify the benefits of such a more complex solution Since task throttling requires the use of the CFS bandwidth controller, which is not widely used on mobile systems (at least not by Android kernels so far), for the time being we go for the simple solution and boost also for throttled RQs. sched/tune: fix accounting for runnable tasks (2/5) This patch provides the code required to enforce proper locking. A per boost group spinlock has been added to guarantee atomic accounting of tasks as well as to serialise enqueue/dequeue operations, triggered by task migrations, with the cgroup attach/detach operations. sched/tune: fix accounting for runnable tasks (3/5) This patch adds the cgroup {allow,can,cancel}_attach callbacks. Since a task can be migrated between boost groups while it's running, the cgroup attach callbacks have been added to properly migrate boost contributions of RUNNABLE tasks. The RQ's lock is used to serialise enqueue/dequeue operations, triggered by task migrations, with the cgroup attach/detach operations, while the SchedTune per-CPU lock is used to guarantee atomicity of the accounting within the CPU. NOTE: the current implementation does not allow a concurrent CPU migration and cgroup change. sched/tune: fix accounting for runnable tasks (4/5) This fixes accounting for exiting tasks by adding a dedicated call early in the do_exit() syscall, which disables SchedTune accounting as soon as a task is flagged PF_EXITING. This flag is set before the multiple dequeue/enqueue dance triggered by cgroup_exit(), which only injects pointless task movements and thus increases the chances of race conditions with the migration code. The schedtune_exit_task() call does the last dequeue of a task from its current boost group. This is a solution more aligned with what happens in mainline kernels (>v4.4), where the cgroup exit path no longer moves a dying task to the root control group.
sched/tune: fix accounting for runnable tasks (5/5) To avoid accounting issues at startup, this patch disable the SchedTune accounting until the required data structures have been properly initialized. Signed-off-by: Patrick Bellasi [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- kernel/exit.c | 5 ++ kernel/sched/core.c | 12 +++ kernel/sched/fair.c | 11 ++- kernel/sched/sched.h | 3 + kernel/sched/tune.c | 182 ++++++++++++++++++++++++++++++++++++++----- kernel/sched/tune.h | 6 ++ 6 files changed, 195 insertions(+), 24 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 07110c6020a0..92ff63200287 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,8 @@ #include #include +#include "sched/tune.h" + #include #include #include @@ -699,6 +701,9 @@ void do_exit(long code) } exit_signals(tsk); /* sets PF_EXITING */ + + schedtune_exit_task(tsk); + /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 42c45ccc1f62..f56a528c8ae7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -287,6 +287,18 @@ int sysctl_sched_rt_runtime = 950000; /* cpus with isolated domains */ cpumask_var_t cpu_isolated_map; +struct rq * +lock_rq_of(struct task_struct *p, unsigned long *flags) +{ + return task_rq_lock(p, flags); +} + +void +unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags) +{ + task_rq_unlock(rq, p, flags); +} + /* * this_rq_lock - lock this runqueue and disable interrupts. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6390b4cb916c..a26f40cb7fd0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4234,8 +4234,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; - schedtune_enqueue_task(p, cpu_of(rq)); - /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4246,6 +4244,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } + + /* Update SchedTune accouting */ + schedtune_enqueue_task(p, cpu_of(rq)); + #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -4311,7 +4313,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { - schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -4329,6 +4330,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } } + /* Update SchedTune accouting */ + schedtune_dequeue_task(p, cpu_of(rq)); + #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -5604,7 +5608,6 @@ static inline int find_best_target(struct task_struct *p) * The target CPU can be already at a capacity level higher * than the one required to boost the task. 
*/ - if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a06c19c48057..144d4782a455 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1701,6 +1701,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } +extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags); +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags); + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index a691b8db2888..4c77cc23e65b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -11,6 +11,10 @@ #include "sched.h" #include "tune.h" +#ifdef CONFIG_CGROUP_SCHEDTUNE +static bool schedtune_initialized = false; +#endif + unsigned int sysctl_sched_cfs_boost __read_mostly; extern struct target_nrg schedtune_target_nrg; @@ -222,6 +226,8 @@ struct boost_groups { /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; } group[BOOSTGROUPS_COUNT]; + /* CPU's boost group locking */ + raw_spinlock_t lock; }; /* Boost groups affecting each CPU in the system */ @@ -298,28 +304,24 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +#define ENQUEUE_TASK 1 +#define DEQUEUE_TASK -1 + static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { - struct boost_groups *bg; - int tasks; - - bg = &per_cpu(cpu_boost_groups, cpu); + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int tasks = bg->group[idx].tasks + task_count; /* Update boosted tasks count while avoiding to make it negative */ - if (task_count < 0 && bg->group[idx].tasks <= -task_count) - bg->group[idx].tasks = 0; - else - bg->group[idx].tasks += task_count; - - /* Boost group activation or deactivation on that RQ */ - tasks = bg->group[idx].tasks; - if (tasks == 1 || tasks == 0) - schedtune_cpu_update(cpu); + bg->group[idx].tasks = max(0, tasks); trace_sched_tune_tasks_update(p, cpu, tasks, idx, bg->group[idx].boost, bg->boost_max); + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); } /* @@ -327,9 +329,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) */ void schedtune_enqueue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. @@ -339,13 +346,109 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions for example on + * do_exit()::cgroup_exit() and task migration. 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; - rcu_read_unlock(); - schedtune_tasks_update(p, cpu, idx, 1); + schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +int schedtune_allow_attach(struct cgroup_taskset *tset) +{ + /* We always allows tasks to be moved between existing CGroups */ + return 0; +} + +int schedtune_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct boost_groups *bg; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int src_bg; /* Source boost group index */ + int dst_bg; /* Destination boost group index */ + int tasks; + + if (!unlikely(schedtune_initialized)) + return 0; + + + cgroup_taskset_for_each(task, css, tset) { + + /* + * Lock the CPU's RQ the task is enqueued to avoid race + * conditions with migration code while the task is being + * accounted + */ + rq = lock_rq_of(task, &irq_flags); + + if (!task->on_rq) { + unlock_rq_of(rq, task, &irq_flags); + continue; + } + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... + */ + cpu = cpu_of(rq); + bg = &per_cpu(cpu_boost_groups, cpu); + raw_spin_lock(&bg->lock); + + dst_bg = css_st(css)->idx; + src_bg = task_schedtune(task)->idx; + + /* + * Current task is not changing boostgroup, which can + * happen when the new hierarchy is in use. + */ + if (unlikely(dst_bg == src_bg)) { + raw_spin_unlock(&bg->lock); + unlock_rq_of(rq, task, &irq_flags); + continue; + } + + /* + * This is the case of a RUNNABLE task which is switching its + * current boost group. + */ + + /* Move task from src to dst boost group */ + tasks = bg->group[src_bg].tasks - 1; + bg->group[src_bg].tasks = max(0, tasks); + bg->group[dst_bg].tasks += 1; + + raw_spin_unlock(&bg->lock); + unlock_rq_of(rq, task, &irq_flags); + + /* Update CPU boost group */ + if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1) + schedtune_cpu_update(task_cpu(task)); + + } + + return 0; +} + +void schedtune_cancel_attach(struct cgroup_taskset *tset) +{ + /* This can happen only if SchedTune controller is mounted with + * other hierarchies ane one of them fails. Since usually SchedTune is + * mouted on its own hierarcy, for the time being we do not implement + * a proper rollback mechanism */ + WARN(1, "SchedTune cancel attach not implemented"); } /* @@ -353,26 +456,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) */ void schedtune_dequeue_task(struct task_struct *p, int cpu) { + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; struct schedtune *st; int idx; + if (!unlikely(schedtune_initialized)) + return; + /* * When a task is marked PF_EXITING by do_exit() it's going to be * dequeued and enqueued multiple times in the exit path. * Thus we avoid any further update, since we do not want to change * CPU boosting while the task is exiting. - * The last dequeue will be done by cgroup exit() callback. + * The last dequeue is already enforce by the do_exit() code path + * via schedtune_exit_task(). */ if (p->flags & PF_EXITING) return; - /* Get task boost group */ + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... 
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); rcu_read_lock(); + st = task_schedtune(p); idx = st->idx; - rcu_read_unlock(); - schedtune_tasks_update(p, cpu, idx, -1); + schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +void schedtune_exit_task(struct task_struct *tsk) +{ + struct schedtune *st; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int idx; + + if (!unlikely(schedtune_initialized)) + return; + + rq = lock_rq_of(tsk, &irq_flags); + rcu_read_lock(); + + cpu = cpu_of(rq); + st = task_schedtune(tsk); + idx = st->idx; + schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + unlock_rq_of(rq, tsk, &irq_flags); } int schedtune_cpu_boost(int cpu) @@ -518,6 +657,9 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, +// .allow_attach = schedtune_allow_attach, + .can_attach = schedtune_can_attach, + .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 99637758a8af..be1785eb1c5b 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +void schedtune_exit_task(struct task_struct *tsk); + void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); @@ -25,6 +27,8 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() #define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost() +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) @@ -39,6 +43,8 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta, #define schedtune_cpu_boost(cpu) 0 #define schedtune_task_boost(tsk) 0 +#define schedtune_exit_task(task) do { } while (0) + #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) From 369bcbbae4407a5b50f472a0fa3d569b9a832081 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 29 Jul 2016 14:41:25 +0100 Subject: [PATCH 0216/3220] sched/fair: optimize idle cpu selection for boosted tasks find_best_target CPU selection is biased towards lower CPU IDs. Bias towards higher CPUs for boosted tasks. For boosted tasks unconditionally use the idle CPU returned by find_best_target. 
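[editor's note: the CPU-id bias introduced here reduces to reversing the scan order when the task is boosted, as the "int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;" line in the hunk below shows. A minimal standalone sketch of just that index mapping; candidate_cpu() is a hypothetical name and NR_CPUS is fixed to 4 for the demo.]

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Map a loop index to a candidate CPU id; boosted tasks scan downwards */
static int candidate_cpu(int iter_cpu, bool boosted)
{
	return boosted ? NR_CPUS - iter_cpu - 1 : iter_cpu;
}

int main(void)
{
	int iter;

	printf("normal :");
	for (iter = 0; iter < NR_CPUS; iter++)
		printf(" %d", candidate_cpu(iter, false));	/* 0 1 2 3 */
	printf("\nboosted:");
	for (iter = 0; iter < NR_CPUS; iter++)
		printf(" %d", candidate_cpu(iter, true));	/* 3 2 1 0 */
	printf("\n");
	return 0;
}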
BUG: 29512132 Change-Id: I3d650051752163fcf3dc7909751d1fde3f9d17c0 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 70 +++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a26f40cb7fd0..6efdcad51603 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5569,32 +5569,30 @@ done: return target; } -static inline int find_best_target(struct task_struct *p) +static inline int find_best_target(struct task_struct *p, bool boosted) { - int i, boosted; + int iter_cpu; int target_cpu = -1; int target_capacity = 0; int backup_capacity = 0; - int idle_cpu = -1; + int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; unsigned long task_util_boosted, new_util; - /* - * Favor 1) busy cpu with most capacity at current OPP - * 2) idle_cpu with capacity at current OPP - * 3) busy cpu with capacity at higher OPP - */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - boosted = schedtune_task_boost(p); -#else - boosted = 0; -#endif task_util_boosted = boosted_task_util(p); - for_each_cpu(i, tsk_cpus_allowed(p)) { - int cur_capacity = capacity_curr_of(i); - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { + int cur_capacity; + struct rq *rq; + int idle_idx; + + /* + * favor higher cpus for boosted tasks + */ + int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + + if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) + continue; /* * p's blocked utilization is still accounted for on prev_cpu @@ -5615,46 +5613,43 @@ static inline int find_best_target(struct task_struct *p) * For boosted tasks we favor idle cpus unconditionally to * improve latency. */ - if (idle_idx >= 0 && boosted) { - if (idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - idle_cpu = i; - } + if (idle_cpu(i) && boosted) { + if (best_idle_cpu < 0) + best_idle_cpu = i; continue; } + cur_capacity = capacity_curr_of(i); + rq = cpu_rq(i); + idle_idx = idle_get_state_idx(rq); + if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { if (target_capacity == 0 || target_capacity > cur_capacity) { - /* busy CPU with most capacity at current OPP */ target_cpu = i; target_capacity = cur_capacity; } } else if (!boosted) { - if (idle_cpu < 0 || + if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { best_idle_cstate = idle_idx; - idle_cpu = i; + best_idle_cpu = i; } } } else if (backup_capacity == 0 || backup_capacity > cur_capacity) { - /* first busy CPU with capacity at higher OPP */ backup_capacity = cur_capacity; backup_cpu = i; } } - if (!boosted && target_cpu < 0) { - target_cpu = idle_cpu >= 0 ? idle_cpu : backup_cpu; - } + if (boosted && best_idle_cpu >= 0) + target_cpu = best_idle_cpu; + else if (target_cpu < 0) + target_cpu = best_idle_cpu >= 0 ? 
best_idle_cpu : backup_cpu; - if (boosted && idle_cpu >= 0) - target_cpu = idle_cpu; return target_cpu; } @@ -5740,9 +5735,16 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) /* * Find a cpu with sufficient capacity */ - int tmp_target = find_best_target(p); +#ifdef CONFIG_CGROUP_SCHEDTUNE + bool boosted = schedtune_task_boost(p) > 0; +#else + bool boosted = 0; +#endif + int tmp_target = find_best_target(p, boosted); if (tmp_target >= 0) target_cpu = tmp_target; + if (boosted && idle_cpu(target_cpu)) + return target_cpu; } if (target_cpu != task_cpu(p)) { From 345be81dba1ba6c7ac1df2b9ba355fc290d5a73b Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 29 Jul 2016 15:32:26 +0100 Subject: [PATCH 0217/3220] sched/tune: fix PB and PC cuts indexes definition The current definition of the Performance Boost (PB) and Performance Constraint (PC) regions has two main issues: 1) in the computation of the boost index we overflow the threshold_gains table for boost=100 2) the two cuts did _NOT_ have the same ratio The last point means that when boost=0 we do _not_ get the "standard" EAS behaviour, i.e. accepting all candidates which decrease energy regardless of their impact on performance. Instead, we accept only schedule candidates which are in the Optimal region, i.e. decrease energy while increasing performance. This behaviour can also have a negative impact on CPU selection policies which try to spread tasks to reduce latencies. Indeed, for example, we could end up rejecting a schedule candidate which wants to move a task from a congested CPU to an idle one, specifically in the case where the target CPU would be running at a lower OPP. This patch fixes these two issues by properly clamping the boost value to the appropriate range when computing the threshold indexes, as well as by using the same threshold index for both cuts. Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan sched/tune: fix update of threshold index for boost groups When SchedTune is configured to work in CGroup mode, each time we update the boost value of a group we do not update the threshold indexes for the definition of the Performance Boost (PB) and Performance Constraint (PC) regions. This means that while the OPP boosting and CPU biasing selection work as expected, the __schedtune_accept_deltas function is always using the initial values for these cuts. This patch ensures that each time a new boost value is configured for a boost group, the cuts for the PB and PC regions are properly updated too. Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan sched/tune: update PC and PB cuts definition The current definition of Performance Boost (PB) and Performance Constraint (PC) cuts defines two "dead regions": - up to 20% boost: we are in energy-reduction only mode, i.e. accept all candidates which reduce energy - over 70% boost: we are in performance-increase only mode, i.e. accept only sched candidates which do not reduce performance This patch uses a finer-grained configuration where these two "dead regions" are reduced to: up to 10% and over 90%. This should provide some boosting benefit starting from 10% boost values, while not being too permissive starting from boost values of 80%.
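[editor's note: both fixes reduce to one clamped index computation shared by the PB and PC cuts. A standalone sketch, assuming the 10-entry threshold_gains[] table from the patch; clamp_int() stands in for the kernel's clamp() and threshold_index() is a hypothetical name.]

#include <stdio.h>

/* Minimal stand-in for the kernel's clamp() */
static int clamp_int(int val, int lo, int hi)
{
	return val < lo ? lo : (val > hi ? hi : val);
}

/*
 * One index for both the PB and PC cuts: clamping to 99 keeps
 * boost=100 inside a 10-entry table (issue 1), and sharing the
 * index gives both cuts the same ratio (issue 2).
 */
static unsigned int threshold_index(int boost_pct)
{
	return clamp_int(boost_pct, 0, 99) / 10;
}

int main(void)
{
	printf("%u %u %u\n",
	       threshold_index(100),	/* 9: no longer overflows */
	       threshold_index(0),	/* 0 */
	       threshold_index(-50));	/* 0: negative boosts share bucket 0 */
	return 0;
}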
Suggested-by: Leo Yan Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan bug: 28312446 Change-Id: Ia326c66521e38c98e7a7eddbbb7c437875efa1ba Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4c77cc23e65b..d24f365b0c90 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -38,16 +38,16 @@ struct threshold_params { */ static struct threshold_params threshold_gains[] = { - { 0, 4 }, /* >= 0% */ - { 0, 4 }, /* >= 10% */ - { 1, 4 }, /* >= 20% */ - { 2, 4 }, /* >= 30% */ - { 3, 4 }, /* >= 40% */ - { 4, 3 }, /* >= 50% */ - { 4, 2 }, /* >= 60% */ - { 4, 1 }, /* >= 70% */ - { 4, 0 }, /* >= 80% */ - { 4, 0 } /* >= 90% */ + { 0, 5 }, /* < 10% */ + { 1, 5 }, /* < 20% */ + { 2, 5 }, /* < 30% */ + { 3, 5 }, /* < 40% */ + { 4, 5 }, /* < 50% */ + { 5, 4 }, /* < 60% */ + { 5, 3 }, /* < 70% */ + { 5, 2 }, /* < 80% */ + { 5, 1 }, /* < 90% */ + { 5, 0 } /* <= 100% */ }; static int @@ -549,13 +549,29 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, s64 boost) { struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; if (boost < -100 || boost > 100) return -EINVAL; + boost_pct = boost; + + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementation uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + st->perf_boost_idx = threshold_idx; + st->perf_constrain_idx = threshold_idx; st->boost = boost; - if (css == &root_schedtune.css) + if (css == &root_schedtune.css) { sysctl_sched_cfs_boost = boost; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; + } /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); @@ -710,17 +726,25 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + unsigned threshold_idx; + int boost_pct; if (ret || !write) return ret; - /* Performance Boost (B) region threshold params */ - perf_boost_idx = sysctl_sched_cfs_boost; - perf_boost_idx /= 10; + if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100) return -EINVAL; + boost_pct = sysctl_sched_cfs_boost; - /* Performance Constraint (C) region threshold params */ - perf_constrain_idx = 100 - sysctl_sched_cfs_boost; - perf_constrain_idx /= 10; + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementation uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; return 0; } From efb86bd08a2e9217d0b3c33753cf63d27e7c86da Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 31 May 2016 09:08:38 -0700 Subject: [PATCH 0218/3220] sched: Introduce Window Assisted Load Tracking (WALT) Use a window-based view of time in order to track task demand and CPU utilization in the scheduler.
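[editor's note: the window-based demand signal can be pictured with a short standalone sketch. Per the ravg comments added by this patch, a task keeps the frequency-scaled sums of its last few windows, and 'demand' is the maximum sum seen over them. The real walt.c update policy is tunable and ignores windows where the task slept entirely, which this sketch omits; names mirror the patch where possible, but the code is illustrative only.]

#include <stdio.h>

#define RAVG_HIST_SIZE_MAX 5

struct ravg_sketch {
	unsigned int sum_history[RAVG_HIST_SIZE_MAX];
	unsigned int demand;
};

/* Roll a completed window's frequency-scaled busy sum into the history */
static void update_history(struct ravg_sketch *r, unsigned int sum)
{
	unsigned int max = 0;
	int i;

	for (i = RAVG_HIST_SIZE_MAX - 1; i > 0; i--)
		r->sum_history[i] = r->sum_history[i - 1];
	r->sum_history[0] = sum;

	/* 'demand' tracks the maximum sum over the recent windows */
	for (i = 0; i < RAVG_HIST_SIZE_MAX; i++)
		if (r->sum_history[i] > max)
			max = r->sum_history[i];
	r->demand = max;
}

int main(void)
{
	struct ravg_sketch r = { {0}, 0 };
	unsigned int windows[] = { 900, 100, 200, 150, 120, 130, 110 };
	int i;

	for (i = 0; i < 7; i++)
		update_history(&r, windows[i]);

	/* The 900 spike has aged out of the 5-window history: demand = 200 */
	printf("demand = %u\n", r.demand);
	return 0;
}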
Window Assisted Load Tracking (WALT) implementation credits: Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla and Todd Kjos Change-Id: I21408236836625d4e7d7de1843d20ed5ff36c708 Includes fixes for issues: eas/walt: Use walt_ktime_clock() instead of ktime_get_ns() to avoid a race resulting in watchdog resets BUG: 29353986 Change-Id: Ic1820e22a136f7c7ebd6f42e15f14d470f6bbbdb Handle walt accounting anomoly during resume During resume, there is a corner case where on wakeup, a task's prev_runnable_sum can go negative. This is a workaround that fixes the condition and warns (instead of crashing). BUG: 29464099 Change-Id: I173e7874324b31a3584435530281708145773508 Signed-off-by: Todd Kjos Signed-off-by: Srinath Sridharan Signed-off-by: Juri Lelli [jstultz: fwdported to 4.4] Signed-off-by: John Stultz --- include/linux/sched.h | 53 ++ include/linux/sched/sysctl.h | 5 + include/trace/events/sched.h | 149 +++++ init/Kconfig | 9 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 43 +- kernel/sched/fair.c | 20 + kernel/sched/rt.c | 4 + kernel/sched/sched.h | 34 ++ kernel/sched/stop_task.c | 3 + kernel/sched/walt.c | 1098 ++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 57 ++ kernel/sysctl.c | 23 + 13 files changed, 1498 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/walt.c create mode 100644 kernel/sched/walt.h diff --git a/include/linux/sched.h b/include/linux/sched.h index ee59abcab30d..ce8db12f9e72 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -317,6 +317,15 @@ extern char ___assert_task_state[1 - 2*!!( /* Task command name length */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + #include /* @@ -1274,6 +1283,41 @@ struct sched_statistics { }; #endif +#ifdef CONFIG_SCHED_WALT +#define RAVG_HIST_SIZE_MAX 5 + +/* ravg represents frequency scaled cpu-demand of tasks */ +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. 
+ * + * 'curr_window' represents task's contribution to cpu busy time + * statistics (rq->curr_runnable_sum) in current window + * + * 'prev_window' represents task's contribution to cpu busy time + * statistics (rq->prev_runnable_sum) in previous window + */ + u64 mark_start; + u32 sum, demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 curr_window, prev_window; + u16 active_windows; +}; +#endif + struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; @@ -1431,6 +1475,15 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2834841c507e..710f58a28d63 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -43,6 +43,11 @@ extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7dc39dd384de..33b1c719c9d7 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -888,6 +888,155 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +#ifdef CONFIG_SCHED_WALT +struct rq; + +TRACE_EVENT(walt_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, int evt, + u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) + __field( u64, wallclock ) + __field( u64, mark_start ) + __field( u64, delta_m ) + __field( u64, win_start ) + __field( u64, delta ) + __field( u64, irqtime ) + __field( int, evt ) + __field(unsigned int, demand ) + __field(unsigned int, sum ) + __field( int, cpu ) + __field( u64, cs ) + __field( u64, ps ) + __field( u32, curr_window ) + __field( u32, prev_window ) + __field( u64, nt_cs ) + __field( u64, nt_ps ) + __field( u32, active_windows ) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->delta = (wallclock - rq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->delta_m = (wallclock - p->ravg.mark_start); + __entry->demand = p->ravg.demand; + __entry->sum = p->ravg.sum; + __entry->irqtime = irqtime; + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + __entry->nt_cs = rq->nt_curr_runnable_sum; + __entry->nt_ps = rq->nt_prev_runnable_sum; + __entry->active_windows = p->ravg.active_windows; + ), + + TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u 
cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" + " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u" + , __entry->wallclock, __entry->win_start, __entry->delta, + __entry->evt, __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->sum, __entry->irqtime, + __entry->cs, __entry->ps, + __entry->curr_window, __entry->prev_window, + __entry->nt_cs, __entry->nt_ps, + __entry->active_windows + ) +); + +TRACE_EVENT(walt_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + int evt), + + TP_ARGS(rq, p, runtime, samples, evt), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field(unsigned int, runtime ) + __field( int, samples ) + __field( int, evt ) + __field( u64, demand ) + __field(unsigned int, walt_avg ) + __field(unsigned int, pelt_avg ) + __array( u32, hist, RAVG_HIST_SIZE_MAX) + __field( int, cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = p->ravg.demand; + __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window, + __entry->pelt_avg = p->se.avg.util_avg; + memcpy(__entry->hist, p->ravg.sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %d demand %llu" + " walt %u pelt %u (hist: %u %u %u %u %u) cpu %d", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, __entry->evt, + __entry->demand, + __entry->walt_avg, + __entry->pelt_avg, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu) +); + +TRACE_EVENT(walt_migration_update_sum, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __field(int, cpu ) + __field(int, pid ) + __field( u64, cs ) + __field( u64, ps ) + __field( s64, nt_cs ) + __field( s64, nt_ps ) + ), + + TP_fast_assign( + __entry->cpu = cpu_of(rq); + __entry->cs = rq->curr_runnable_sum; + __entry->ps = rq->prev_runnable_sum; + __entry->nt_cs = (s64)rq->nt_curr_runnable_sum; + __entry->nt_ps = (s64)rq->nt_prev_runnable_sum; + __entry->pid = p->pid; + ), + + TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d", + __entry->cpu, __entry->cs, __entry->ps, + __entry->nt_cs, __entry->nt_ps, __entry->pid) +); +#endif /* CONFIG_SCHED_WALT */ + #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/init/Kconfig b/init/Kconfig index facb2b587d5b..33f4e3965e3c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -392,6 +392,15 @@ config IRQ_TIME_ACCOUNTING endchoice +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. 
+ config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 174762d8695b..623ce4bde0d5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,6 +15,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f56a528c8ae7..b6f41850ed00 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -89,6 +89,7 @@ #define CREATE_TRACE_POINTS #include +#include "walt.h" DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1085,7 +1086,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); + double_unlock_balance(rq, cpu_rq(new_cpu)); raw_spin_unlock(&rq->lock); rq = cpu_rq(new_cpu); @@ -1309,6 +1312,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); + + walt_fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1937,6 +1942,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; +#ifdef CONFIG_SMP + struct rq *rq; + u64 wallclock; +#endif /* * If we are going to wake up a thread waiting for CONDITION we @@ -1994,6 +2003,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); + rq = cpu_rq(task_cpu(p)); + + raw_spin_lock(&rq->lock); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + raw_spin_unlock(&rq->lock); + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2001,10 +2018,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); @@ -2053,8 +2072,13 @@ static void try_to_wake_up_local(struct task_struct *p) trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + u64 wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); @@ -2120,6 +2144,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -2387,6 +2412,9 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + + walt_init_new_task_load(p); + /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2399,6 +2427,7 @@ void wake_up_new_task(struct task_struct *p) 
#endif rq = __task_rq_lock(p); + walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2948,9 +2977,12 @@ void scheduler_tick(void) sched_clock_tick(); raw_spin_lock(&rq->lock); + walt_set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); @@ -3188,6 +3220,7 @@ static void __sched notrace __schedule(bool preempt) unsigned long *switch_count; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3249,6 +3282,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -5669,6 +5705,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; break; @@ -5688,6 +5727,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -7532,6 +7572,7 @@ void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; + walt_init_cpu_efficiency(); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6efdcad51603..79d54592b53a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,11 +30,13 @@ #include #include #include +#include #include #include "sched.h" #include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -56,6 +58,10 @@ unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -4209,6 +4215,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4216,6 +4223,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4230,6 +4238,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; @@ -4279,6 +4288,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; 
cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4299,6 +4309,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4313,6 +4324,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP if (!se) { + walt_dec_cumulative_runnable_avg(rq, p); /* * We want to potentially trigger a freq switch @@ -5207,6 +5219,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif return p->se.avg.util_avg; } @@ -6599,7 +6617,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); } /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9694204660b7..be700bfa1ae4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -8,6 +8,8 @@ #include #include +#include "walt.h" + int sched_rr_timeslice = RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -1261,6 +1263,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1272,6 +1275,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 144d4782a455..bfd99f9bc6f9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -410,6 +410,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; @@ -663,6 +667,27 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_WALT + /* + * max_freq = user or thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq, max_possible_freq; + struct cpumask freq_domain_cpumask; + + u64 cumulative_runnable_avg; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + int capacity; + int max_possible_capacity; + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +#endif /* CONFIG_SCHED_WALT */ + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1513,6 +1538,10 @@ static inline unsigned long capacity_orig_of(int cpu) return cpu_rq(cpu)->cpu_capacity_orig; } +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_ravg_window; +extern unsigned int walt_disabled; + /* * cpu_util returns the amount of capacity 
of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can @@ -1544,6 +1573,11 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) / + walt_ravg_window; +#endif delta += util; if (delta < 0) return 0; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..61f852d46858 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. @@ -42,12 +43,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 000000000000..1dff3d2e2358 --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1098 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * Window Assisted Load Tracking (WALT) implementation credits: + * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, + * Pavan Kumar Kondeti, Olav Haugan + * + * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla + * and Todd Kjos + */ + +#include +#include +#include +#include "sched.h" +#include "walt.h" + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +static __read_mostly unsigned int walt_ravg_hist_size = 5; +static __read_mostly unsigned int walt_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +static __read_mostly unsigned int walt_account_wait_time = 1; +static __read_mostly unsigned int walt_freq_account_wait_time = 0; +static __read_mostly unsigned int walt_io_is_busy = 0; + +unsigned int sysctl_sched_walt_init_task_load_pct = 15; + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled = 0; + +static unsigned int max_possible_efficiency = 1024; +static unsigned int min_possible_efficiency = 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +static unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. 
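+ * (For example, on a hypothetical big.LITTLE system whose little cores
+ * top out at 1.4GHz and whose big cores reach 2.1GHz, max_possible_freq
+ * ends up 2.1GHz while min_max_freq is 1.4GHz.)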
+ */ +static unsigned int min_max_freq = 1; + +static unsigned int max_capacity = 1024; +static unsigned int min_capacity = 1024; +static unsigned int max_load_scale_factor = 1024; +static unsigned int max_possible_capacity = 1024; + +/* Mask of all CPUs that have max_possible_capacity */ +static cpumask_t mpc_mask = CPU_MASK_ALL; + +/* Window size (in ns) */ +__read_mostly unsigned int walt_ravg_window = 20000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +static unsigned int sync_cpu; +static ktime_t ktime_last; +static bool walt_ktime_suspended; + +static unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +void +walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg += p->ravg.demand; +} + +void +walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg -= p->ravg.demand; + BUG_ON((s64)rq->cumulative_runnable_avg < 0); +} + +static void +fixup_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p, s64 task_load_delta) +{ + rq->cumulative_runnable_avg += task_load_delta; + if ((s64)rq->cumulative_runnable_avg < 0) + panic("cra less than zero: tld: %lld, task_load(p) = %u\n", + task_load_delta, task_load(p)); +} + +u64 walt_ktime_clock(void) +{ + if (unlikely(walt_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void walt_resume(void) +{ + walt_ktime_suspended = false; +} + +static int walt_suspend(void) +{ + ktime_last = ktime_get(); + walt_ktime_suspended = true; + return 0; +} + +static struct syscore_ops walt_syscore_ops = { + .resume = walt_resume, + .suspend = walt_suspend +}; + +static int __init walt_init_ops(void) +{ + register_syscore_ops(&walt_syscore_ops); + return 0; +} +late_initcall(walt_init_ops); + +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg += p->ravg.demand; +} + +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg -= p->ravg.demand; +} + +static int exiting_task(struct task_struct *p) +{ + if (p->flags & PF_EXITING) { + if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) { + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + } + return 1; + } + return 0; +} + +static int __init set_walt_ravg_window(char *str) +{ + get_option(&str, &walt_ravg_window); + + walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + return 0; +} + +early_param("walt_ravg_window", set_walt_ravg_window); + +static void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < walt_ravg_window) + return; + + nr_windows = div64_u64(delta, walt_ravg_window); + rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; +} + +static u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned int cur_freq = rq->cur_freq; + int sf; + + if (unlikely(cur_freq > max_possible_freq)) + cur_freq = rq->max_possible_freq; + + /* round up div64 */ + delta = div64_u64(delta * cur_freq + max_possible_freq - 1, + max_possible_freq); + + sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); + + delta *= sf; + delta >>= 10; + + return delta; +} + +static int cpu_is_waiting_on_io(struct rq *rq) +{ + if 
(!walt_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE || + event == TASK_UPDATE) + return 1; + + /* Only TASK_MIGRATE && PICK_NEXT_TASK left */ + return walt_freq_account_wait_time; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, nr_full_windows = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = walt_ravg_window; + u64 delta; + + new_window = mark_start < window_start; + if (new_window) { + nr_full_windows = div64_u64((window_start - mark_start), + window_size); + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + /* Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. */ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!nr_full_windows) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO */ + + if (!new_window) + return; + + /* A new window has started. The RQ demand must be rolled + * over if p is the current task. */ + if (p_is_curr_task) { + u64 prev_sum = 0; + + /* p is either idle task or an exiting task */ + if (!nr_full_windows) { + prev_sum = rq->curr_runnable_sum; + } + + rq->prev_runnable_sum = prev_sum; + rq->curr_runnable_sum = 0; + } + + return; + } + + if (!new_window) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + rq->curr_runnable_sum += delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. 
*/ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + rq->prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum += delta; + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. */ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + + delta += rq->curr_runnable_sum; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + + } + /* + * Rollover for normal runnable sum is done here by overwriting + * the values in prev_runnable_sum and curr_runnable_sum. + * Rollover for new task runnable sum has completed by previous + * if-else statement. + */ + rq->prev_runnable_sum = delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum = delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. */ + rq->prev_runnable_sum = rq->curr_runnable_sum; + if (mark_start > window_start) { + rq->curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + rq->prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* No need to bother updating task demand for exiting tasks + * or the idle task. */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. */ + if (event == TASK_WAKE || (!walt_account_wait_time && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = walt_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (walt_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (walt_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, walt_ravg_hist_size); + if (walt_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + fixup_cumulative_runnable_avg(rq, p, demand); + + p->ravg.demand = demand; + +done: + trace_walt_update_history(rq, p, runtime, samples, event); + return; +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, + u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > walt_ravg_window)) + p->ravg.sum = walt_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. 
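+ * (Concretely, with a 20ms window: if ms falls 15ms into the old
+ * window and wc falls 5ms into the new one, p->ravg.sum first gains
+ * the scaled 5ms up to ws, that sum is recorded as the closed
+ * window's sample, and p->ravg.sum then restarts with the scaled
+ * 5ms between ws and wc.)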
+ * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = walt_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* The simple case - busy time contained within the existing + * window. */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* Roll window_start back to current to process any remainder + * in current window. 
*/ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + if (walt_disabled || !rq->window_start) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) + goto done; + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + +done: + trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime); + + p->ravg.mark_start = wallclock; +} + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +void walt_init_cpu_efficiency(void) +{ + int i, efficiency; + unsigned int max = 0, min = UINT_MAX; + + for_each_possible_cpu(i) { + efficiency = arch_get_cpu_efficiency(i); + cpu_rq(i)->efficiency = efficiency; + + if (efficiency > max) + max = efficiency; + if (efficiency < min) + min = efficiency; + } + + if (max) + max_possible_efficiency = max; + + if (min) + min_possible_efficiency = min; +} + +static void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void walt_mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start) { + reset_task_stats(p); + return; + } + + wallclock = walt_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +void walt_set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start) + return; + + if (cpu == sync_cpu) { + rq->window_start = walt_ktime_clock(); + } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +void walt_migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +void walt_fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + + if (p->ravg.curr_window) { + src_rq->curr_runnable_sum -= p->ravg.curr_window; + dest_rq->curr_runnable_sum += p->ravg.curr_window; + } + + if (p->ravg.prev_window) { + src_rq->prev_runnable_sum -= p->ravg.prev_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + } + + if ((s64)src_rq->prev_runnable_sum < 0) { + src_rq->prev_runnable_sum = 0; + WARN_ON(1); + } + if ((s64)src_rq->curr_runnable_sum < 0) { + src_rq->curr_runnable_sum = 0; + WARN_ON(1); + } + + trace_walt_migration_update_sum(src_rq, p); + trace_walt_migration_update_sum(dest_rq, p); + + if (p->state == TASK_WAKING) + 
double_rq_unlock(src_rq, dest_rq); +} + +/* Keep track of max/min capacity possible across CPUs "currently" */ +static void __update_min_max_capacity(void) +{ + int i; + int max = 0, min = INT_MAX; + + for_each_online_cpu(i) { + if (cpu_rq(i)->capacity > max) + max = cpu_rq(i)->capacity; + if (cpu_rq(i)->capacity < min) + min = cpu_rq(i)->capacity; + } + + max_capacity = max; + min_capacity = min; +} + +static void update_min_max_capacity(void) +{ + unsigned long flags; + int i; + + local_irq_save(flags); + for_each_possible_cpu(i) + raw_spin_lock(&cpu_rq(i)->lock); + + __update_min_max_capacity(); + + for_each_possible_cpu(i) + raw_spin_unlock(&cpu_rq(i)->lock); + local_irq_restore(flags); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long capacity_scale_cpu_efficiency(int cpu) +{ + return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(int cpu) +{ + return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static unsigned long load_scale_cpu_efficiency(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cpu_rq(cpu)->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static unsigned long load_scale_cpu_freq(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq); +} + +static int compute_capacity(int cpu) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cpu); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cpu); + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(int cpu) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. + */ + load_scale *= load_scale_cpu_efficiency(cpu); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cpu); + load_scale >>= 10; + + return load_scale; +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + int i, update_max = 0; + u64 highest_mpc = 0, highest_mplsf = 0; + const struct cpumask *cpus = policy->related_cpus; + unsigned int orig_min_max_freq = min_max_freq; + unsigned int orig_max_possible_freq = max_possible_freq; + /* Initialized to policy->max in case policy->related_cpus is empty! 
*/ + unsigned int orig_max_freq = policy->max; + + if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && + val != CPUFREQ_CREATE_POLICY) + return 0; + + if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { + update_min_max_capacity(); + return 0; + } + + for_each_cpu(i, policy->related_cpus) { + cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, + policy->related_cpus); + orig_max_freq = cpu_rq(i)->max_freq; + cpu_rq(i)->min_freq = policy->min; + cpu_rq(i)->max_freq = policy->max; + cpu_rq(i)->cur_freq = policy->cur; + cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + /* Changes to policy other than max_freq don't require any updates */ + if (orig_max_freq == policy->max) + return 0; + + /* + * A changed min_max_freq or max_possible_freq (possible during bootup) + * needs to trigger re-computation of load_scale_factor and capacity for + * all possible cpus (even those offline). It also needs to trigger + * re-computation of nr_big_task count on all online cpus. + * + * A changed rq->max_freq otoh needs to trigger re-computation of + * load_scale_factor and capacity for just the cluster of cpus involved. + * Since small task definition depends on max_load_scale_factor, a + * changed load_scale_factor of one cluster could influence + * classification of tasks in another cluster. Hence a changed + * rq->max_freq will need to trigger re-computation of nr_big_task + * count on all online cpus. + * + * While it should be sufficient for nr_big_tasks to be + * re-computed for only online cpus, we have inadequate context + * information here (in policy notifier) with regard to hotplug-safety + * context in which notification is issued. As a result, we can't use + * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is + * fixed up to issue notification always in hotplug-safe context, + * re-compute nr_big_task for all possible cpus. + */ + + if (orig_min_max_freq != min_max_freq || + orig_max_possible_freq != max_possible_freq) { + cpus = cpu_possible_mask; + update_max = 1; + } + + /* + * Changed load_scale_factor can trigger reclassification of tasks as + * big or small. 
Make this change "atomic" so that tasks are accounted + * properly due to changed load_scale_factor + */ + for_each_cpu(i, cpus) { + struct rq *rq = cpu_rq(i); + + rq->capacity = compute_capacity(i); + rq->load_scale_factor = compute_load_scale_factor(i); + + if (update_max) { + u64 mpc, mplsf; + + mpc = div_u64(((u64) rq->capacity) * + rq->max_possible_freq, rq->max_freq); + rq->max_possible_capacity = (int) mpc; + + mplsf = div_u64(((u64) rq->load_scale_factor) * + rq->max_possible_freq, rq->max_freq); + + if (mpc > highest_mpc) { + highest_mpc = mpc; + cpumask_clear(&mpc_mask); + cpumask_set_cpu(i, &mpc_mask); + } else if (mpc == highest_mpc) { + cpumask_set_cpu(i, &mpc_mask); + } + + if (mplsf > highest_mplsf) + highest_mplsf = mplsf; + } + } + + if (update_max) { + max_possible_capacity = highest_mpc; + max_load_scale_factor = highest_mplsf; + } + + __update_min_max_capacity(); + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + int i; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_rq(cpu)->cur_freq == new_freq) + return 0; + + for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { + struct rq *rq = cpu_rq(i); + + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); + rq->cur_freq = new_freq; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +void walt_init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = + div64_u64((u64)sysctl_sched_walt_init_task_load_pct * + (u64)walt_ravg_window, 100); + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)walt_ravg_window, 100); + } + + p->ravg.demand = init_load_windows; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 000000000000..cabc193a683d --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_fixup_busy_time(struct task_struct *p, int new_cpu); +void walt_init_new_task_load(struct task_struct *p); +void walt_mark_task_starting(struct task_struct *p); +void walt_set_window_start(struct rq *rq); +void walt_migrate_sync_cpu(int cpu); +void walt_init_cpu_efficiency(void); +u64 walt_ktime_clock(void); + +#else /* CONFIG_SCHED_WALT */ + +static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void walt_init_new_task_load(struct task_struct *p) { } +static inline void walt_mark_task_starting(struct task_struct *p) { } +static inline void walt_set_window_start(struct rq *rq) { } +static inline void walt_migrate_sync_cpu(int cpu) { } +static inline void walt_init_cpu_efficiency(void) { } +static inline u64 walt_ktime_clock(void) { return 0; } + +#endif /* CONFIG_SCHED_WALT */ + +extern unsigned int walt_disabled; + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4457e107ec20..43c59da62f1e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -310,6 +310,29 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "sched_sync_hint_enable", .data = &sysctl_sched_sync_hint_enable, From 519c62750eb6ebbb5783315272398ced72d7a036 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 22 Jul 2016 13:21:15 +0100 Subject: [PATCH 0219/3220] sched/walt: Accounting for number of irqs pending on each core Schedules on a core whose irq count is less than a threshold. Improves I/O performance of EAS. 
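The decay rule used for the per-cpu irq load can be read in isolation from the walt_account_irqtime() hunk below; the following standalone sketch (types reduced, behaviour mirrored, not part of the patch) also makes explicit that multiplying by 3 * nr_windows and dividing by 4 * nr_windows reduces to a single 3/4 decay step however many windows elapsed:

    typedef unsigned long long u64;

    /* Decay the running irq-load average once at least one jiffy window
     * has passed; after ten or more idle windows, simply start over. */
    static u64 decay_irqload(u64 avg, u64 cur, u64 nr_windows)
    {
            if (!nr_windows)
                    return avg;     /* same window: keep accumulating cur */
            if (nr_windows < 10)
                    avg = (avg * 3 * nr_windows) / (4 * nr_windows);
            else
                    avg = 0;
            return avg + cur;       /* fold the last window's load in */
    }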
Change-Id: I08ff7dd0d22502a0106fc636b1af2e6fe9e758b5 --- include/linux/sched/sysctl.h | 1 + kernel/sched/core.c | 5 +++ kernel/sched/cputime.c | 16 +++++++++ kernel/sched/fair.c | 7 +++- kernel/sched/sched.h | 3 ++ kernel/sched/walt.c | 65 ++++++++++++++++++++++++++++++++++++ kernel/sched/walt.h | 5 +++ kernel/sysctl.c | 7 ++++ 8 files changed, 108 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 710f58a28d63..d68e88c9d4d7 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -47,6 +47,7 @@ extern unsigned int sysctl_sched_cstate_aware; extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int sysctl_sched_use_walt_task_util; extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_walt_cpu_high_irqload; #endif enum sched_tunable_scaling { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b6f41850ed00..deb45d1b4e49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7750,6 +7750,11 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 05de80b48586..442a9f7a2832 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,7 @@ #include #include #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -49,6 +50,10 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; @@ -56,6 +61,9 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -70,8 +78,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); +#ifdef CONFIG_SCHED_WALT + else + account = false; +#endif irq_time_write_end(); +#ifdef CONFIG_SCHED_WALT + if (account) + walt_account_irqtime(cpu, curr, delta, wallclock); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79d54592b53a..0ba5653682e8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -61,6 +61,8 @@ unsigned int sysctl_sched_cstate_aware = 1; #ifdef CONFIG_SCHED_WALT unsigned int sysctl_sched_use_walt_cpu_util = 1; unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); #endif /* * The initial- and re-scaling of tunables is configurable @@ -4258,7 +4260,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) schedtune_enqueue_task(p, cpu_of(rq)); #endif /* CONFIG_SMP */ - hrtick_update(rq); } @@ -5627,6 +5628,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted) if (new_util > capacity_orig_of(i)) continue; +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif /* * For boosted tasks we favor idle cpus unconditionally to * improve latency. 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bfd99f9bc6f9..911d3a94690e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -685,6 +685,9 @@ struct rq { u64 prev_runnable_sum; u64 nt_curr_runnable_sum; u64 nt_prev_runnable_sum; + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; #endif /* CONFIG_SCHED_WALT */ diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 1dff3d2e2358..b9ae8d5c4393 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -221,6 +221,71 @@ static int cpu_is_waiting_on_io(struct rq *rq) return atomic_read(&rq->nr_iowait); } +void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + + +#define WALT_HIGH_IRQ_TIMEOUT 3 + +u64 walt_irqload(int cpu) { + struct rq *rq = cpu_rq(cpu); + s64 delta; + delta = get_jiffies_64() - rq->irqload_ts; + + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. 
+ */ + + if (delta < WALT_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +int walt_cpu_high_irqload(int cpu) { + return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload; +} + static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, u64 irqtime, int event) { diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index cabc193a683d..e181c87a928d 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -31,6 +31,11 @@ void walt_set_window_start(struct rq *rq); void walt_migrate_sync_cpu(int cpu); void walt_init_cpu_efficiency(void); u64 walt_ktime_clock(void); +void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta, + u64 wallclock); + +u64 walt_irqload(int cpu); +int walt_cpu_high_irqload(int cpu); #else /* CONFIG_SCHED_WALT */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 43c59da62f1e..2b89e8ff0688 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -332,6 +332,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_walt_cpu_high_irqload", + .data = &sysctl_sched_walt_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif { .procname = "sched_sync_hint_enable", From dfc1151b46bb5862154a8be2be867dbc1c8aaeb8 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:00:41 +0100 Subject: [PATCH 0220/3220] FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use The CPU utilization reported when WALT is in use already tracks the contributions due to RT and DL workloads. However, SchedFreq exposes different capacity update functions, one for each class, and aggregates the classes' utilization internally at update_cpu_capacity_request() call time. This patch ensures that when WALT is in use, the cpu_sched_capacity_reqs::cfs value tracks just the load generated by SCHED_OTHER tasks. Change-Id: Ibd9c9a10874a1d91f62477034548f7664e57cd6a Signed-off-by: Patrick Bellasi --- kernel/sched/sched.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 911d3a94690e..3111d74a5a85 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1611,8 +1611,27 @@ void update_cpu_capacity_request(int cpu, bool request); static inline void set_cfs_cpu_capacity(int cpu, bool request, unsigned long capacity) { - if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + int rtdl = scr->rt + scr->dl; + /* + * WALT tracks the utilization of a CPU considering the load + * generated by all the scheduling classes. + * Since the following call to: + * update_cpu_capacity + * is already adding the RT and DL utilizations let's remove + * these contributions from the WALT signal. + */ + if (capacity > rtdl) + capacity -= rtdl; + else + capacity = 0; + } +#endif + if (scr->cfs != capacity) { + scr->cfs = capacity; update_cpu_capacity_request(cpu, request); } } From 782c9d64a1d12aaf3d6decf7b54e212343bb3f7e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 29 Jun 2016 11:30:07 -0700 Subject: [PATCH 0221/3220] sched: EAS: Avoid causing spikes to max-freq unnecessarily During scheduler tick handling, the frequency was being set to max-freq if the current frequency is less than the current utilization.
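A quick worked example of the new request (capacity_margin = 1280, the EAS default, is assumed here; it is defined elsewhere in fair.c, and the numbers are illustrative):

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE    1024
    static unsigned long capacity_margin = 1280;    /* assumed EAS default */

    /* Margined CFS+RT demand plus DL, as in sum_capacity_reqs() below. */
    static unsigned long sum_caps(unsigned long cfs, unsigned long rt,
                                  unsigned long dl)
    {
            return (cfs + rt) * capacity_margin / SCHED_CAPACITY_SCALE + dl;
    }

    int main(void)
    {
            /* cfs=300, rt=50, dl=100: request 537 rather than the max 1024. */
            printf("request = %lu\n", sum_caps(300, 50, 100));
            return 0;
    }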
Change to just request "right" frequency instead of max. BUG: 29871410 Change-Id: I6fe65b14413da44b1520ba116f72320083eb92f8 --- kernel/sched/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index deb45d1b4e49..f07615a1674f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2940,7 +2940,7 @@ static unsigned long sum_capacity_reqs(unsigned long cfs_cap, static void sched_freq_tick(int cpu) { struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr; + unsigned long capacity_orig, capacity_curr, capacity_sum; if (!sched_freq()) return; @@ -2953,12 +2953,15 @@ static void sched_freq_tick(int cpu) /* * To make free room for a task that is building up its "real" * utilization and to harm its performance the least, request - * a jump to max OPP as soon as the margin of free capacity is - * impacted (specified by capacity_margin). + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) - set_cfs_cpu_capacity(cpu, true, capacity_max); + capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); + if (capacity_curr < capacity_sum) { + set_cfs_cpu_capacity(cpu, true, capacity_sum); + } } #else static inline void sched_freq_tick(int cpu) { } From 5156b672040dfd29ba141e02689e1e92e34d525f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 30 Jun 2016 15:09:24 +0100 Subject: [PATCH 0222/3220] FIXUP: sched: fix SchedFreq integration for both PELT and WALT The current kernel allows to use either PELT or WALT to track CPUs utilizations. One of the main differences between the two approaches is that PELT tracks only utilization of SCHED_OTHER classes while WALT tracks all tasks with a single signal. The current sched_freq_tick does not make this distinction and, when WALT is in use, we end up adding multiple time the contribution related to the RT and DL classes. This patch fixes this issue by: 1. providing two different code paths for PELT and WALT, thus granting that when we switch to PELT we get the original behaviour based on the assumption that class aggregations is done underneath by SchedFreq. 2. avoiding the double accounting of DL and RT workloads, when WALT is in use, by just adding a margin to the original WALT signal when we need to check if the CFS capacity has to be increased. 
Change-Id: I7326fd50e868e97fb5e12351917e9d2969bfdae7 Signed-off-by: Patrick Bellasi --- kernel/sched/core.c | 91 +++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f07615a1674f..02743ffe7ff3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2926,21 +2926,77 @@ unsigned long long task_sched_runtime(struct task_struct *p) } #ifdef CONFIG_CPU_FREQ_GOV_SCHED -static unsigned long sum_capacity_reqs(unsigned long cfs_cap, - struct sched_capacity_reqs *scr) -{ - unsigned long total = cfs_cap + scr->rt; - total = total * capacity_margin; - total /= SCHED_CAPACITY_SCALE; - total += scr->dl; - return total; +static inline +unsigned long add_capacity_margin(unsigned long cpu_capacity) +{ + cpu_capacity = cpu_capacity * capacity_margin; + cpu_capacity /= SCHED_CAPACITY_SCALE; + return cpu_capacity; } +static inline +unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = add_capacity_margin(cfs_cap + scr->rt); + return total += scr->dl; +} + +static void sched_freq_tick_pelt(int cpu) +{ + unsigned long cpu_utilization = capacity_max; + unsigned long capacity_curr = capacity_curr_of(cpu); + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); +} + +#ifdef CONFIG_SCHED_WALT +static void sched_freq_tick_walt(int cpu) +{ + unsigned long cpu_utilization = cpu_util(cpu); + unsigned long capacity_curr = capacity_curr_of(cpu); + + if (walt_disabled || !sysctl_sched_use_walt_cpu_util) + return sched_freq_tick_pelt(cpu); + + /* + * Add a margin to the WALT utilization. + * NOTE: WALT tracks a single CPU signal for all the scheduling + * classes, thus this margin is going to be added to the DL class as + * well, which is something we do not do in sched_freq_tick_pelt case. + */ + cpu_utilization = add_capacity_margin(cpu_utilization); + if (cpu_utilization <= capacity_curr) + return; + + /* + * It is likely that the load is growing so we + * keep the added margin in our request as an + * extra boost. + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); + +} +#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu) +#else +#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu) +#endif /* CONFIG_SCHED_WALT */ + static void sched_freq_tick(int cpu) { - struct sched_capacity_reqs *scr; - unsigned long capacity_orig, capacity_curr, capacity_sum; + unsigned long capacity_orig, capacity_curr; if (!sched_freq()) return; @@ -2950,22 +3006,11 @@ static void sched_freq_tick(int cpu) if (capacity_curr == capacity_orig) return; - /* - * To make free room for a task that is building up its "real" - * utilization and to harm its performance the least, request - * a jump to a higher OPP as soon as the margin of free capacity - * is impacted (specified by capacity_margin). 
- */ - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - capacity_sum = sum_capacity_reqs(cpu_util(cpu), scr); - if (capacity_curr < capacity_sum) { - set_cfs_cpu_capacity(cpu, true, capacity_sum); - } + _sched_freq_tick(cpu); } #else static inline void sched_freq_tick(int cpu) { } -#endif +#endif /* CONFIG_CPU_FREQ_GOV_SCHED */ /* * This function gets called by the timer code, with HZ frequency. From 740d312ce8aae83956ed1b34b8a4e72b60b04a8f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 16 Jun 2016 16:33:54 -0700 Subject: [PATCH 0223/3220] FIXUP: sched/fair: Fix hang during suspend in sched_group_energy BUG: 29353986 Change-Id: I0d0d8d5c107a2e0bd219819e036091106bb40e11 --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0ba5653682e8..0b065ff2f4f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4980,6 +4980,7 @@ static int sched_group_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); continue; } From 8935b6b4d230ae7969ccdfaf7baf09d818f88c8b Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 4 Jul 2016 15:04:45 +0100 Subject: [PATCH 0224/3220] FIXUP: sched: Fix double-release of spinlock in move_queued_task BUG: 29519455 Change-Id: I4d1c27a1b4bcbba03d4b175d170cfe1701a90ffd --- kernel/sched/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3111d74a5a85..4e2e44e3cc7b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1832,7 +1832,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); + if (this_rq != busiest) + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } From 23ed57dbcc14871af54db667d460aba64eaf6efe Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 25 Jul 2016 15:13:58 +0100 Subject: [PATCH 0225/3220] arch_timer: add error handling when the MPM global timer is cleared Bug: 29000863 Signed-off-by: albert.zl_huang Change-Id: I2b5a28b0a9edb31bdaa1ca2310397dd2f36f6c23 Updated to use arch_timer_read_counter() as arch_counter_get_cntvct doesn't exist in this kernel. Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index b9ae8d5c4393..d9d09914ce30 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -185,7 +185,14 @@ update_window_start(struct rq *rq, u64 wallclock) int nr_windows; delta = wallclock - rq->window_start; - BUG_ON(delta < 0); + /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ + if (delta < 0) { + if (arch_timer_read_counter() == 0) + delta = 0; + else + BUG_ON(1); + } + if (delta < walt_ravg_window) return; From d4cda03828f5c8eae35efcb08f520f8f1a35950e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 13 Jul 2016 16:13:47 -0700 Subject: [PATCH 0226/3220] sched: use util instead of capacity to select busy cpu If cpus are busy, the cpu selection algorithm was favoring cpus with lower capacity. This can result in uneven packing since there will be a bias toward the same cpu until there is a capacity change. 
Instead, use the utilization so there is immediate feedback as tasks
are assigned.

BUG: 30115868
Change-Id: I0ac7ae3ab5d8f2f5a5838c29bb6da2c3e8ef44e8
---
 kernel/sched/fair.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b065ff2f4f3..7b6e95aa7360 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5593,7 +5593,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
 {
 	int iter_cpu;
 	int target_cpu = -1;
-	int target_capacity = 0;
+	int target_util = 0;
 	int backup_capacity = 0;
 	int best_idle_cpu = -1;
 	int best_idle_cstate = INT_MAX;
@@ -5649,10 +5649,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
 
 		if (new_util < cur_capacity) {
 			if (cpu_rq(i)->nr_running) {
-				if (target_capacity == 0 ||
-					target_capacity > cur_capacity) {
+				if (target_util == 0 ||
+					target_util > new_util) {
 					target_cpu = i;
-					target_capacity = cur_capacity;
+					target_util = new_util;
 				}
 			} else if (!boosted) {
 				if (best_idle_cpu < 0 ||

From c5a00c2dad8d161da3c2086cccd6375d8ad5b04f Mon Sep 17 00:00:00 2001
From: Srinath Sridharan
Date: Thu, 14 Jul 2016 13:09:03 -0700
Subject: [PATCH 0227/3220] sched/tune: Introducing a new schedtune attribute prefer_idle

Hint to enable biasing of tasks towards idle cpus, even when a given
task is negatively boosted. The mechanism allows up to 20% reduction in
camera power without hurting performance.

bug: 28312446
Change-Id: I97ea5671aa1e6bcb165408b41e17bc82e41c2c9e
---
 kernel/sched/fair.c | 23 +++++++++++++----------
 kernel/sched/tune.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/tune.h |  2 ++
 3 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b6e95aa7360..e099ce747345 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5589,7 +5589,7 @@ done:
 	return target;
 }
 
-static inline int find_best_target(struct task_struct *p, bool boosted)
+static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 {
 	int iter_cpu;
 	int target_cpu = -1;
@@ -5607,9 +5607,9 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 		int idle_idx;
 
 		/*
-		 * favor higher cpus for boosted tasks
+		 * favor higher cpus for tasks that prefer idle cores
 		 */
-		int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+		int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu;
 
 		if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
 			continue;
@@ -5634,10 +5634,10 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 			continue;
 #endif
 		/*
-		 * For boosted tasks we favor idle cpus unconditionally to
+		 * Unconditionally favoring tasks that prefer idle cpus to
 		 * improve latency.
 		 */
-		if (idle_cpu(i) && boosted) {
+		if (idle_cpu(i) && prefer_idle) {
 			if (best_idle_cpu < 0)
 				best_idle_cpu = i;
 			continue;
@@ -5654,7 +5654,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 					target_cpu = i;
 					target_util = new_util;
 				}
-			} else if (!boosted) {
+			} else if (!prefer_idle) {
 				if (best_idle_cpu < 0 ||
 					(sysctl_sched_cstate_aware &&
 						best_idle_cstate > idle_idx)) {
@@ -5669,7 +5669,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 		}
 	}
 
-	if (boosted && best_idle_cpu >= 0)
+	if (prefer_idle && best_idle_cpu >= 0)
 		target_cpu = best_idle_cpu;
 	else if (target_cpu < 0)
 		target_cpu = best_idle_cpu >= 0 ?
best_idle_cpu : backup_cpu; @@ -5761,14 +5761,17 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) */ #ifdef CONFIG_CGROUP_SCHEDTUNE bool boosted = schedtune_task_boost(p) > 0; + bool prefer_idle = schedtune_prefer_idle(p) > 0; #else bool boosted = 0; + bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted); - if (tmp_target >= 0) + int tmp_target = find_best_target(p, boosted || prefer_idle); + if (tmp_target >= 0) { target_cpu = tmp_target; - if (boosted && idle_cpu(target_cpu)) + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) return target_cpu; + } } if (target_cpu != task_cpu(p)) { diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d24f365b0c90..644f8e9ee96f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -125,6 +125,10 @@ struct schedtune { /* Performance Constraint (C) region threshold params */ int perf_constrain_idx; + + /* Hint to bias scheduling of tasks on that SchedTune CGroup + * towards idle CPUs */ + int prefer_idle; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -156,6 +160,7 @@ root_schedtune = { .boost = 0, .perf_boost_idx = 0, .perf_constrain_idx = 0, + .prefer_idle = 0, }; int @@ -536,6 +541,38 @@ int schedtune_task_boost(struct task_struct *p) return task_boost; } +int schedtune_prefer_idle(struct task_struct *p) +{ + struct schedtune *st; + int prefer_idle; + + /* Get prefer_idle value */ + rcu_read_lock(); + st = task_schedtune(p); + prefer_idle = st->prefer_idle; + rcu_read_unlock(); + + return prefer_idle; +} + +static u64 +prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->prefer_idle; +} + +static int +prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 prefer_idle) +{ + struct schedtune *st = css_st(css); + st->prefer_idle = prefer_idle; + + return 0; +} + static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -587,6 +624,11 @@ static struct cftype files[] = { .read_s64 = boost_read, .write_s64 = boost_write, }, + { + .name = "prefer_idle", + .read_u64 = prefer_idle_read, + .write_u64 = prefer_idle_write, + }, { } /* terminate */ }; diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index be1785eb1c5b..4f6441771e4c 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -17,6 +17,8 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +int schedtune_prefer_idle(struct task_struct *tsk); + void schedtune_exit_task(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); From 93db70f21c7bf04dfe5bb3345400da8f97c3ad29 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 10 Feb 2016 09:24:36 +0000 Subject: [PATCH 0228/3220] DEBUG: sched: add tracepoint for RD overutilized Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 20 ++++++++++++++++++++ kernel/sched/fair.c | 17 +++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 33b1c719c9d7..8006e3a46723 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -888,6 +888,26 @@ TRACE_EVENT(sched_tune_filter, __entry->payoff, __entry->region) ); +/* + * Tracepoint for system overutilized flag + */ +TRACE_EVENT(sched_overutilized, + + TP_PROTO(bool overutilized), + + TP_ARGS(overutilized), + + TP_STRUCT__entry( + __field( bool, overutilized ) + ), + + 
TP_fast_assign(
+		__entry->overutilized	= overutilized;
+	),
+
+	TP_printk("overutilized=%d",
+		__entry->overutilized ? 1 : 0)
+);
 #ifdef CONFIG_SCHED_WALT
 struct rq;

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e099ce747345..b84277a8530d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4242,8 +4242,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		walt_inc_cumulative_runnable_avg(rq, p);
 		if (!task_new && !rq->rd->overutilized &&
-		    cpu_overutilized(rq->cpu))
+		    cpu_overutilized(rq->cpu)) {
 			rq->rd->overutilized = true;
+			trace_sched_overutilized(true);
+		}
 
 		/*
 		 * We want to potentially trigger a freq switch
@@ -7503,12 +7505,17 @@ next_group:
 		env->dst_rq->rd->overload = overload;
 
 		/* Update over-utilization (tipping point, U >= 0) indicator */
-		if (env->dst_rq->rd->overutilized != overutilized)
+		if (env->dst_rq->rd->overutilized != overutilized) {
 			env->dst_rq->rd->overutilized = overutilized;
+			trace_sched_overutilized(overutilized);
+		}
 	} else {
-		if (!env->dst_rq->rd->overutilized && overutilized)
+		if (!env->dst_rq->rd->overutilized && overutilized) {
 			env->dst_rq->rd->overutilized = true;
+			trace_sched_overutilized(true);
+		}
 	}
+
 }
 
 /**
@@ -8948,8 +8955,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);
 
 #ifdef CONFIG_SMP
-	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
 		rq->rd->overutilized = true;
+		trace_sched_overutilized(true);
+	}
 
 	rq->misfit_task = !task_fits_max(curr, rq->cpu);
 #endif

From b9534b8f0128efdb00865af03106a45ce9d489cb Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Fri, 29 Jul 2016 16:09:03 +0100
Subject: [PATCH 0229/3220] FIXUP: sched/tune: do initialization as a postcore_initcall

SchedTune needs to walk the scheduling domains to compute the energy
normalization constants used for PE space filtering. To build such
constants we need the energy model data for each CPU in the system.

However, if we walk the SDs at the late initcall stage, userspace has
already been initialized and it can happen that some CPUs are
hotplugged out. For example, this can happen if a user-space thermal
manager daemon detects that CPUs are too hot during the boot process.

To avoid such a race condition we can move the SchedTune initialization
code earlier, to a postcore_initcall. This allows us to keep the
SchedTune initialization code as simple as an initcall while still
safely relying on the data the SDs provide. Such calls are executed
before user-space is initialized and thus, apart from the case of
unlucky early-init kernel-space-generated hotplugs, this solution
should be safe enough to get all the data we need.
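The ordering guarantee this relies on can be sketched with two stub initcalls. This is an illustrative fragment for a built-in kernel object, not part of the patch; the function names are invented. postcore_initcall() functions run at one of the earliest initcall levels, well before late_initcall() and before init (and therefore before any hotplug-triggering daemon) can run:

#include <linux/init.h>
#include <linux/printk.h>

static int __init sketch_postcore_setup(void)
{
	/* Runs early: userspace does not exist yet, so no hotplug races. */
	pr_info("postcore initcall\n");
	return 0;
}
postcore_initcall(sketch_postcore_setup);

static int __init sketch_late_setup(void)
{
	/* Runs last among initcalls, potentially racing early userspace. */
	pr_info("late initcall\n");
	return 0;
}
late_initcall(sketch_late_setup);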
Signed-off-by: Patrick Bellasi
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz
---
 kernel/sched/tune.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 644f8e9ee96f..bd7f319ce53e 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -946,4 +946,4 @@ nodata:
 	rcu_read_unlock();
 	return -EINVAL;
 }
-late_initcall(schedtune_init);
+postcore_initcall(schedtune_init);

From c80a9af21aece64d3262900c78bdf39f2fdd6c2e Mon Sep 17 00:00:00 2001
From: Srinath Sridharan
Date: Tue, 2 Aug 2016 14:05:46 -0700
Subject: [PATCH 0230/3220] sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs

When idle cpus cannot be found for Top-app/FG tasks, the cpu selection
algorithm picks a cpu with the lowest OPP amongst the busy cpus as a
second choice. This mitigates the "runnable" time for ui and render
threads.

bug: 30481949
bug: 30342017
bug: 30508678

Change-Id: I5a97e31d33284895c0fa6f6942102713ee576d77
---
 kernel/sched/fair.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b84277a8530d..b7b1f2759278 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5651,10 +5651,22 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 
 		if (new_util < cur_capacity) {
 			if (cpu_rq(i)->nr_running) {
-				if (target_util == 0 ||
-					target_util > new_util) {
-					target_cpu = i;
-					target_util = new_util;
+				if (prefer_idle) {
+					// Find a target cpu with highest
+					// utilization.
+					if (target_util == 0 ||
+						target_util < new_util) {
+						target_cpu = i;
+						target_util = new_util;
+					}
+				} else {
+					// Find a target cpu with lowest
+					// utilization.
+					if (target_util == 0 ||
+						target_util > new_util) {
+						target_cpu = i;
+						target_util = new_util;
+					}
 				}
 			} else if (!prefer_idle) {
 				if (best_idle_cpu < 0 ||
@@ -5666,6 +5678,7 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 				}
 			} else if (backup_capacity == 0 ||
 					backup_capacity > cur_capacity) {
+				// Find a backup cpu with least capacity.
 				backup_capacity = cur_capacity;
 				backup_cpu = i;
 			}

From bf93a3672c5e4c1b803958dc6c038f0b163df5cf Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 4 Aug 2016 12:20:04 +0100
Subject: [PATCH 0231/3220] sched/cpufreq_sched: fix thermal capping events

cpufreq_sched_limits (called when a CPUFREQ_GOV_LIMITS event happens)
bails out if policy->rwsem is already locked. However, that rwsem is
always guaranteed to be locked when we get here after a thermal
throttling event happens:

 th_throttling ->
   cpufreq_update_policy()
     ...
     down_write(&policy->rwsem);
     ...
     cpufreq_set_policy() ->
       ...
       __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); ->
         cpufreq_sched_limits()
         ...
         if (!down_write_trylock(&policy->rwsem))
                 return; <-- BAIL OUT!

So, we don't currently react immediately to thermal capping events
(even if the reaction is still quick in practice, ~1ms, as lots of
events are likely to trigger a frequency selection on a highly loaded
system).

Fix this bug by removing the bail-out condition. While we are at it we
also slightly change the handling of the new limits by clamping the
last requested_freq between the policy's max and min. Doing so gives us
the opportunity to correctly restore the last requested frequency as
soon as a thermal unthrottling event happens.
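The clamp-and-restore behaviour described above is easy to model in user space. The sketch below is illustrative only: it uses invented frequency values and a local helper instead of the kernel's clamp() macro:

#include <stdio.h>

static unsigned int clamp_freq(unsigned int req, unsigned int min,
			       unsigned int max)
{
	return req < min ? min : (req > max ? max : req);
}

int main(void)
{
	unsigned int requested = 1800000;		/* last request, kHz */
	unsigned int pmin = 300000, pmax = 1200000;	/* thermal cap */

	/* While throttled, run at the cap... */
	printf("capped:   %u kHz\n", clamp_freq(requested, pmin, pmax));

	/* ...and once the cap is lifted, the old request comes back. */
	pmax = 1900000;
	printf("uncapped: %u kHz\n", clamp_freq(requested, pmin, pmax));
	return 0;
}

Because requested_freq is kept as the unclamped value, no extra bookkeeping is needed to restore it after unthrottling.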
bug: 30481949
Change-Id: I3c13e818f238c1ffa66b34e419e8b87314b57427
Suggested-by: Javi Merino
Signed-off-by: Juri Lelli
Signed-off-by: Srinath Sridharan
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz
---
 kernel/sched/cpufreq_sched.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index 3f8c67a3ea0f..4fea269a6598 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -58,7 +58,6 @@ struct gov_data {
 	struct task_struct *task;
 	struct irq_work irq_work;
 	unsigned int requested_freq;
-	int max;
 };
 
 static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
@@ -193,7 +192,7 @@ static void update_fdomain_capacity_request(int cpu)
 	}
 
 	/* Convert the new maximum capacity request into a cpu frequency */
-	freq_new = capacity * gd->max >> SCHED_CAPACITY_SHIFT;
+	freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
 	if (cpufreq_frequency_table_target(policy, policy->freq_table,
 					   freq_new, CPUFREQ_RELATION_L,
 					   &index_new))
@@ -288,8 +287,6 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
 	pr_debug("%s: throttle threshold = %u [ns]\n",
 		 __func__, gd->up_throttle_nsec);
 
-	gd->max = policy->max;
-
 	rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
 	if (rc) {
 		pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
@@ -352,28 +349,17 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy)
 
 static void cpufreq_sched_limits(struct cpufreq_policy *policy)
 {
-	struct gov_data *gd;
+	unsigned int clamp_freq;
+	struct gov_data *gd = policy->governor_data;
 
 	pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
 		 policy->cpu, policy->min, policy->max, policy->cur);
 
-	if (!down_write_trylock(&policy->rwsem))
-		return;
-	/*
-	 * Need to keep track of highest max frequency for
-	 * capacity calculations
-	 */
-	gd = policy->governor_data;
-	if (gd->max < policy->max)
-		gd->max = policy->max;
+	clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
 
-	if (policy->max < policy->cur)
-		__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
-	else if (policy->min > policy->cur)
-		__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
-
-	up_write(&policy->rwsem);
+	if (policy->cur != clamp_freq)
+		__cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
 }
 
 static int cpufreq_sched_stop(struct cpufreq_policy *policy)

From 74b4fa8e5cfe7655adb40bc88d6d88f916cee250 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Fri, 13 May 2016 11:54:04 +0100
Subject: [PATCH 0232/3220] sched/fair: call OPP update when going idle after migration

When a task leaves a rq because it is migrated away, it carries its
utilization with it. In this case an OPP update on the src rq might be
needed. The corresponding update at the dst rq will happen at enqueue
time.

Change-Id: I22754a43760fc8d22a488fe15044af93787ea7a8

sched/fair: Fix uninitialised variable in idle_balance

Compiler warned, looks legit.
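Reduced to its decision logic, the change this patch makes to idle_balance() looks roughly like the condensed sketch below; it is not the full function, and the helpers and fields are the ones used by the diff that follows:

/* Condensed sketch of the changed portion of idle_balance(). */
static int idle_balance_sketch(struct rq *this_rq, int this_cpu)
{
	int pulled_task = 0;
	long removed_util;

	/* Snapshot before update_blocked_averages() consumes it. */
	removed_util = atomic_long_read(&this_rq->cfs.removed_util_avg);
	update_blocked_averages(this_cpu);

	/* ... try to pull a task from a busier CPU ... */

	if (pulled_task) {
		/* The pulled task's enqueue triggers the OPP update. */
	} else if (removed_util) {
		/* Nothing pulled but utilization left: request an update. */
		update_capacity_of(this_cpu);
	}
	return pulled_task;
}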
Signed-off-by: Chris Redpath
---
 kernel/sched/fair.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b7b1f2759278..d9af0384dbb0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8322,6 +8322,7 @@ static int idle_balance(struct rq *this_rq)
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	u64 curr_cost = 0;
+	long removed_util=0;
 
 	idle_enter_fair(this_rq);
 
@@ -8345,6 +8346,17 @@ static int idle_balance(struct rq *this_rq)
 
 	raw_spin_unlock(&this_rq->lock);
 
+	/*
+	 * If removed_util_avg is !0 we most probably migrated some task away
+	 * from this_cpu. In this case we might be willing to trigger an OPP
+	 * update, but we want to do so if we don't find anybody else to pull
+	 * here (we will trigger an OPP update with the pulled task's enqueue
+	 * anyway).
+	 *
+	 * Record removed_util before calling update_blocked_averages, and use
+	 * it below (before returning) to see if an OPP update is required.
+	 */
+	removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
 	update_blocked_averages(this_cpu);
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
@@ -8409,6 +8421,12 @@ out:
 	if (pulled_task) {
 		idle_exit_fair(this_rq);
 		this_rq->idle_stamp = 0;
+	} else if (removed_util) {
+		/*
+		 * No task pulled and someone has been migrated away.
+		 * Good case to trigger an OPP update.
+		 */
+		update_capacity_of(this_cpu);
 	}
 
 	return pulled_task;

From 142b2acc79777f3cac93144d7ebf9caa53e97f9b Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 14 Jan 2016 15:21:40 -0800
Subject: [PATCH 0233/3220] vmstat: make vmstat_updater deferrable again and shut down on idle

Currently the vmstat updater is not deferrable as a result of commit
ba4877b9ca51 ("vmstat: do not use deferrable delayed work for
vmstat_update"). This in turn can cause multiple interruptions of the
applications because the vmstat updater may run at any time.

Make vmstat_update deferrable again and provide a function that folds
the differentials when the processor is going to idle mode, thus
addressing the issue of the above commit in a clean way.

Note that the shepherd thread will continue scanning the differentials
from another processor and will reenable the vmstat workers if it
detects any changes.
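The fold-until-quiescent loop at the heart of the new quiet_vmstat() can be modelled in plain C. The per-cpu differentials below are invented for illustration, and fold_diffs() stands in for refresh_cpu_vm_stats(false):

#include <stdio.h>
#include <stdbool.h>

static int deltas[4] = { 3, 0, 7, 1 };	/* invented per-cpu differentials */

/* Model of refresh_cpu_vm_stats(false): fold deltas, report if any. */
static bool fold_diffs(void)
{
	bool changed = false;

	for (int i = 0; i < 4; i++) {
		if (deltas[i]) {
			deltas[i] = 0;	/* folded into the global counter */
			changed = true;
		}
	}
	return changed;
}

int main(void)
{
	int passes = 0;

	do {
		passes++;
	} while (fold_diffs());	/* loop until a pass finds nothing left */

	printf("quiesced after %d passes\n", passes);
	return 0;
}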
Change-Id: Idf256cfacb40b4dc8dbb6795cf06b34e8fec7a06 Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update") Signed-off-by: Christoph Lameter Cc: Michal Hocko Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Git-commit: 0eb77e9880321915322d42913c3b53241739c8aa [shashim@codeaurora.org: resolve minor merge conflicts] Signed-off-by: Shiraz Hashim [jstultz: fwdport to 4.4] Signed-off-by: John Stultz --- include/linux/vmstat.h | 2 ++ kernel/sched/idle.c | 1 + mm/vmstat.c | 69 +++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3e5d9075960f..73fae8c4a5fb 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -189,6 +189,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); +void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); @@ -249,6 +250,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } +static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cbc130efbc5b..917c94abf5bb 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -220,6 +220,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/mm/vmstat.c b/mm/vmstat.c index c54fd2924f25..83a003bc3cae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. 
+ */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1417,6 +1419,23 @@ static void vmstat_update(struct work_struct *w) } } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + do { + if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + cancel_delayed_work(this_cpu_ptr(&vmstat_work)); + + } while (refresh_cpu_vm_stats(false)); +} + /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1449,7 +1468,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { From 07ec7db165f1990ca563e18888e82991eac1c5e5 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Fri, 29 Jul 2016 17:50:11 +0100 Subject: [PATCH 0234/3220] sched/fair: Favor higher cpus only for boosted tasks This CL separates the notion of boost and prefer_idle schedtune attributes in cpu selection. Today only top-app tasks are boosted. The CPU selection is slightly tweaked such that higher order cpus are preferred only for boosted tasks (top-app) and the rest would be skewed towards lower order cpus. This avoids starvation issues for fg tasks when interacting with high priority top-app tasks (a problem often seen in the case of system_server). bug: 30245369 bug: 30292998 Change-Id: I0377e00893b9f6586eec55632a265518fd2fa8a1 Conflicts: kernel/sched/fair.c --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9af0384dbb0..da9323b9ecab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5591,7 +5591,7 @@ done: return target; } -static inline int find_best_target(struct task_struct *p, bool prefer_idle) +static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { int iter_cpu; int target_cpu = -1; @@ -5609,9 +5609,9 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle) int idle_idx; /* - * favor higher cpus for tasks that prefer idle cores + * Iterate from higher cpus for boosted tasks. */ - int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu; + int i = boosted ? 
NR_CPUS-iter_cpu-1 : iter_cpu; if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) continue; @@ -5781,7 +5781,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) bool boosted = 0; bool prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted || prefer_idle); + int tmp_target = find_best_target(p, boosted, prefer_idle); if (tmp_target >= 0) { target_cpu = tmp_target; if ((boosted || prefer_idle) && idle_cpu(target_cpu)) From 3276c3d7bca25f229f396cc182d1af29366ab109 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2015 17:43:34 +0200 Subject: [PATCH 0235/3220] UPSTREAM: sched: Fix a race between __kthread_bind() and sched_setaffinity() Because sched_setscheduler() checks p->flags & PF_NO_SETAFFINITY without locks, a caller might observe an old value and race with the set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo it: __kthread_bind() do_set_cpus_allowed() sched_setaffinity() if (p->flags & PF_NO_SETAFFINITIY) set_cpus_allowed_ptr() p->flags |= PF_NO_SETAFFINITY Fix the bug by putting everything under the regular scheduler locks. This also closes a hole in the serialization of task_struct::{nr_,}cpus_allowed. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.545640346@infradead.org Signed-off-by: Ingo Molnar (cherry picked from commit 25834c73f93af7f0712c98ca4593691592e6b360) Signed-off-by: Punit Agrawal BUG=chrome-os-partner:44828 TEST=Boot kernel on Oak. TEST=smaug-release and strago-release trybots. Change-Id: Id3c898c5ee1a22ed704e83f2ecf5f78199280d38 Reviewed-on: https://chromium-review.googlesource.com/321264 Commit-Ready: Ricky Liang Tested-by: Ricky Liang Reviewed-by: Ricky Liang Conflicts: kernel/sched/core.c --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 02743ffe7ff3..40c8e8d1e07f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5157,6 +5157,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock(&rq->lock); __sched_fork(0, idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); From 989f33f789a85cc53f4c3f75e31ddfce65c6d873 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Tue, 17 Jun 2014 21:43:35 -0700 Subject: [PATCH 0236/3220] sched/rt: print RT tasks when RT throttling is activated Existing debug prints do not provide any clues about which tasks may have triggered RT throttling. Print the names and PIDs of all tasks on the throttled rt_rq to help narrow down the source of the problem. 
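The dump function in the patch below builds its report with the common bounded-snprintf pattern: keep a write cursor, pass the remaining space on each call, and stop appending once the buffer is full. A standalone model, with invented task names and PIDs:

#include <stdio.h>

int main(void)
{
	char buf[64];
	char *pos = buf;
	char *end = buf + sizeof(buf);
	const char *tasks[] = { "app_main", "render", "audio" };

	pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
	for (int i = 0; i < 3; i++) {
		/* guard keeps end - pos from going negative on overflow */
		if (pos < end)
			pos += snprintf(pos, end - pos, "\t%s (%d)\n",
					tasks[i], 1000 + i);
	}
	printf("%s", buf);
	return 0;
}

Because snprintf() returns the length it would have written, pos can legitimately run past end after truncation, which is exactly why the pos < end check precedes every append.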
Change-Id: I180534c8a647254ed38e89d0c981a8f8bccd741c Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa --- kernel/sched/rt.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index be700bfa1ae4..2b9121ea91bf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -891,6 +891,42 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *rt_se; + char buf[500]; + char *pos = buf; + char *end = buf + sizeof(buf); + int idx; + + pos += snprintf(pos, sizeof(buf), + "sched: RT throttling activated for rt_rq %p (cpu %d)\n", + rt_rq, cpu_of(rq_of_rt_rq(rt_rq))); + + if (bitmap_empty(array->bitmap, MAX_RT_PRIO)) + goto out; + + pos += snprintf(pos, end - pos, "potential CPU hogs:\n"); + idx = sched_find_first_bit(array->bitmap); + while (idx < MAX_RT_PRIO) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pos < end) + pos += snprintf(pos, end - pos, "\t%s (%d)\n", + p->comm, p->pid); + } + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + } +out: + printk_deferred("%s", buf); +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -914,8 +950,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_deferred_once("sched: RT throttling activated\n"); + + if (!once) { + once = true; + dump_throttled_rt_tasks(rt_rq); + } } else { /* * In case we did anyway, make it go away, From 2a4445395f1be46db1200e1329fae06d034675e5 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Thu, 19 Jun 2014 14:23:33 -0700 Subject: [PATCH 0237/3220] sched/rt: Add Kconfig option to enable panicking for RT throttling This may be useful for detecting and debugging RT throttling issues. Change-Id: I5807a897d11997d76421c1fcaa2918aad988c6c9 Signed-off-by: Matt Wagantall [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa [jstultz: forwardported to 4.4] Signed-off-by: John Stultz --- kernel/sched/rt.c | 9 +++++++++ lib/Kconfig.debug | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2b9121ea91bf..8a16cba968c4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -924,7 +924,16 @@ static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); } out: +#ifdef CONFIG_PANIC_ON_RT_THROTTLING + /* + * Use pr_err() in the BUG() case since printk_sched() will + * not get flushed and deadlock is not a concern. + */ + pr_err("%s", buf); + BUG(); +#else printk_deferred("%s", buf); +#endif } static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8c15b29d5adc..07ccfa681577 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -855,6 +855,15 @@ config SCHED_INFO bool default n +config PANIC_ON_RT_THROTTLING + bool "Panic on RT throttling" + help + Say Y here to enable the kernel to panic when a realtime + runqueue is throttled. 
This may be useful for detecting
+	  and debugging RT throttling issues.
+
+	  Say N if unsure.
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS

From fb8741a0ba7445503ebe648fd96cccc34d7d2088 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Mon, 1 Aug 2016 16:49:07 -0700
Subject: [PATCH 0238/3220] FROMLIST: proc: Fix timerslack_ns CAP_SYS_NICE check when adjusting self

In changing from checking ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)
to capable(CAP_SYS_NICE), I missed that ptrace_may_access succeeds when
p == current, but the CAP_SYS_NICE check doesn't.

Thus while the previous commit was intended to loosen the needed
privileges to modify a process's timerslack, it needlessly restricted a
task modifying its own timerslack via the proc//timerslack_ns
(which is permitted also via the PR_SET_TIMERSLACK method).

This patch corrects this by checking if p == current before checking
the CAP_SYS_NICE value.

Cc: Kees Cook
Cc: "Serge E. Hallyn"
Cc: Andrew Morton
Cc: Thomas Gleixner
CC: Arjan van de Ven
Cc: Oren Laadan
Cc: Ruchi Kandoi
Cc: Rom Lemarchand
Cc: Todd Kjos
Cc: Colin Cross
Cc: Nick Kralevich
Cc: Dmitry Shmidt
Cc: Elliott Hughes
Cc: Android Kernel Team
Mailing-list-url: http://www.spinics.net/lists/kernel/msg2317488.html
Change-Id: Ia3e8aff07c2d41f55b6617502d33c39b7d781aac
Signed-off-by: John Stultz
---
 fs/proc/base.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ac5d2aea63b5..49348349e156 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2260,15 +2260,17 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
 	if (!p)
 		return -ESRCH;
 
-	if (!capable(CAP_SYS_NICE)) {
-		count = -EPERM;
-		goto out;
-	}
+	if (p != current) {
+		if (!capable(CAP_SYS_NICE)) {
+			count = -EPERM;
+			goto out;
+		}
 
-	err = security_task_setscheduler(p);
-	if (err) {
-		count = err;
-		goto out;
+		err = security_task_setscheduler(p);
+		if (err) {
+			count = err;
+			goto out;
+		}
 	}
 
 	task_lock(p);
@@ -2294,14 +2296,16 @@ static int timerslack_ns_show(struct seq_file *m, void *v)
 	if (!p)
 		return -ESRCH;
 
-	if (!capable(CAP_SYS_NICE)) {
-		err = -EPERM;
-		goto out;
-	}
+	if (p != current) {
 
-	err = security_task_getscheduler(p);
-	if (err)
-		goto out;
+		if (!capable(CAP_SYS_NICE)) {
+			err = -EPERM;
+			goto out;
+		}
+		err = security_task_getscheduler(p);
+		if (err)
+			goto out;
+	}
 
 	task_lock(p);
 	seq_printf(m, "%llu\n", p->timer_slack_ns);

From ef58c0a2695707ab8b49368aa77ab7b4828aa1a7 Mon Sep 17 00:00:00 2001
From: Amit Pundir
Date: Fri, 12 Aug 2016 11:24:50 +0530
Subject: [PATCH 0239/3220] ANDROID: net: fib: remove duplicate assignment

Remove duplicate FRA_GOTO assignment.
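Why the duplicate entry was harmless but still wrong: in a C designated initializer, a repeated index silently overrides the earlier one (GCC only complains with -Woverride-init). A toy model with an invented enum:

#include <stdio.h>

enum { ATTR_UNSPEC, ATTR_GOTO, ATTR_MAX };

static const int policy[ATTR_MAX] = {
	[ATTR_GOTO] = 1,
	[ATTR_GOTO] = 2,	/* the later entry silently wins */
};

int main(void)
{
	printf("ATTR_GOTO -> %d\n", policy[ATTR_GOTO]);	/* prints 2 */
	return 0;
}

In the fib_rules.h macro both FRA_GOTO entries were identical, so behaviour does not change; the patch simply drops the dead duplicate.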
Fixes: fd2cf795f3ab ("net: core: Support UID-based routing.")
Change-Id: I462c24b16fdef42ae2332571a0b95de3ef9d2e25
Signed-off-by: Amit Pundir
---
 include/net/fib_rules.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 55b5419cb6a7..bdd985f41022 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -89,7 +89,6 @@ struct fib_rules_ops {
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]	= { .type = NLA_U32 }, \
-	[FRA_GOTO]	= { .type = NLA_U32 }, \
 	[FRA_UID_START]	= { .type = NLA_U32 }, \
 	[FRA_UID_END]	= { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \

From 0b848391c27294982778254abb2d16723977e839 Mon Sep 17 00:00:00 2001
From: Amit Pundir
Date: Thu, 11 Aug 2016 19:13:22 +0530
Subject: [PATCH 0240/3220] ANDROID: net: core: fix UID-based routing

Fix the RTA_UID enum value to match the Android userspace code, which
assumes RTA_UID=18. With this patch all the Android kernel networking
unit tests mentioned here
https://source.android.com/devices/tech/config/kernel_network_tests.html
pass. Without this patch the multinetwork_test.py unit test fails.

Change-Id: I3ff36670f7d4e5bf5f01dce584ae9d53deabb3ed
Signed-off-by: Amit Pundir
---
 include/uapi/linux/rtnetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 355eea225dd9..3eb02a1d6d8c 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -306,12 +306,12 @@ enum rtattr_type_t {
 	RTA_TABLE,
 	RTA_MARK,
 	RTA_MFC_STATS,
+	RTA_UID,
 	RTA_VIA,
 	RTA_NEWDST,
 	RTA_PREF,
 	RTA_ENCAP_TYPE,
 	RTA_ENCAP,
-	RTA_UID,
 	__RTA_MAX
 };

From 1da2a42de4b9371d2988ddf6969ce5efd1560270 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney
Date: Tue, 5 Jul 2016 17:32:29 -0400
Subject: [PATCH 0241/3220] UPSTREAM: Revert "ecryptfs: forbid opening files without mmap handler"

(cherry picked from commit 78c4e172412de5d0456dc00d2b34050aa0b683b5)

This reverts upstream commit 2f36db71009304b3f0b95afacd8eba1f9f046b87.

It fixed a local root exploit but also introduced a dependency on the
lower file system implementing an mmap operation just to open a file,
which is a bit of a heavy hammer. The right fix is to have mmap depend
on the existence of the mmap handler instead.

Signed-off-by: Jeff Mahoney
Cc: stable@vger.kernel.org
Signed-off-by: Tyler Hicks
Fixes: Change-Id I0be77c7f8bd3046bc34cd87ef577529792d479bc ("UPSTREAM:
 ecryptfs: forbid opening files without mmap handler")
Change-Id: Ib9bc87099f7f89e4e12dbc1a79e884dcadb1befb
Signed-off-by: Amit Pundir
---
 fs/ecryptfs/kthread.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e818f5ac7a26..866bb18efefe 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include
 #include "ecryptfs_kernel.h"
 
 struct ecryptfs_open_req {
@@ -148,7 +147,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
 	flags |= IS_RDONLY(d_inode(lower_dentry)) ?
O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) - goto have_file; + goto out; if ((flags & O_ACCMODE) == O_RDONLY) { rc = PTR_ERR((*lower_file)); goto out; @@ -166,16 +165,8 @@ int ecryptfs_privileged_open(struct file **lower_file, mutex_unlock(&ecryptfs_kthread_ctl.mux); wake_up(&ecryptfs_kthread_ctl.wait); wait_for_completion(&req.done); - if (IS_ERR(*lower_file)) { + if (IS_ERR(*lower_file)) rc = PTR_ERR(*lower_file); - goto out; - } -have_file: - if ((*lower_file)->f_op->mmap == NULL) { - fput(*lower_file); - *lower_file = NULL; - rc = -EMEDIUMTYPE; - } out: return rc; } From 7997255b0d0ffed60ad402c5022eacf0161dcdc0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 5 Jul 2016 17:32:30 -0400 Subject: [PATCH 0242/3220] UPSTREAM: ecryptfs: don't allow mmap when the lower fs doesn't support it (cherry picked from commit f0fe970df3838c202ef6c07a4c2b36838ef0a88b) There are legitimate reasons to disallow mmap on certain files, notably in sysfs or procfs. We shouldn't emulate mmap support on file systems that don't offer support natively. CVE-2016-1583 Signed-off-by: Jeff Mahoney Cc: stable@vger.kernel.org [tyhicks: clean up f_op check by using ecryptfs_file_to_lower()] Signed-off-by: Tyler Hicks Change-Id: I66e3670771630a25b0608f10019d1584e9ce73a6 Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..11309683d65f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -170,6 +170,19 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode speciying file to open @@ -364,7 +377,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, From 4804692cb18d09c49ecfce3eef2969a69859e413 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 4 May 2016 14:04:13 -0400 Subject: [PATCH 0243/3220] UPSTREAM: ecryptfs: fix handling of directory opening (cherry picked from commit 6a480a7842545ec520a91730209ec0bae41694c1) First of all, trying to open them r/w is idiocy; it's guaranteed to fail. Moreover, assigning ->f_pos and assuming that everything will work is blatantly broken - try that with e.g. tmpfs as underlying layer and watch the fireworks. There may be a non-trivial amount of state associated with current IO position, well beyond the numeric offset. Using the single struct file associated with underlying inode is really not a good idea; we ought to open one for each ecryptfs directory struct file. Additionally, file_operations both for directories and non-directories are full of pointless methods; non-directories should *not* have ->iterate(), directories should not have ->flush(), ->fasync() and ->splice_read(). 
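A user-space analogy for the position problem called out above: when two logical readers share one open directory handle, the read position lives in the shared handle and each reader steals entries from the other. This is an illustrative POSIX sketch, not kernel code:

#include <stdio.h>
#include <dirent.h>

int main(void)
{
	DIR *shared = opendir(".");	/* one handle, two logical readers */
	struct dirent *e;

	if (!shared)
		return 1;

	e = readdir(shared);		/* "reader A" consumes entry 0 */
	if (e)
		printf("A sees: %s\n", e->d_name);

	e = readdir(shared);		/* "reader B" gets entry 1, not 0 */
	if (e)
		printf("B sees: %s\n", e->d_name);

	closedir(shared);
	return 0;
}

Opening a separate lower file per ecryptfs directory file gives each reader its own position, plus whatever other iteration state the lower filesystem keeps.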
Signed-off-by: Al Viro Change-Id: I4813ce803f270fdd364758ce1dc108b76eab226e Signed-off-by: Amit Pundir --- fs/ecryptfs/file.c | 71 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 11309683d65f..27794b137b24 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -236,14 +235,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -260,6 +251,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -280,6 +310,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -359,20 +402,16 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct 
file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, From aa3cda16a57e73e9fc269e1de1eb3c35f3f0ed20 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 9 Aug 2016 12:47:37 -0700 Subject: [PATCH 0244/3220] ANDROID: dm-verity: adopt changes made to dm callbacks v4.4 introduced changes to the callbacks used for dm-linear and dm-verity-target targets. Move to those headers in dm-android-verity. Verified on hikey while having BOARD_USES_RECOVERY_AS_BOOT := true BOARD_BUILD_SYSTEM_ROOT_IMAGE := true BUG: 27339727 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ic64950c3b55f0a6eaa570bcedc2ace83bbf3005e --- drivers/md/dm-android-verity.c | 12 +++++------- drivers/md/dm-android-verity.h | 6 ++---- drivers/md/dm-linear.c | 2 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-verity.h | 6 ++---- 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 1f4eb099209d..15ce2a81c1f4 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -59,8 +59,7 @@ static struct target_type android_verity_target = { .dtr = verity_dtr, .map = verity_map, .status = verity_status, - .ioctl = verity_ioctl, - .merge = verity_merge, + .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, .io_hints = verity_io_hints, }; @@ -637,8 +636,7 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) android_verity_target.dtr = dm_linear_dtr, android_verity_target.map = dm_linear_map, android_verity_target.status = dm_linear_status, - android_verity_target.ioctl = dm_linear_ioctl, - android_verity_target.merge = dm_linear_merge, + android_verity_target.prepare_ioctl = dm_linear_prepare_ioctl, android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; @@ -676,7 +674,7 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv) struct fec_ecc_metadata uninitialized_var(ecc); char buf[FEC_ARG_LENGTH], *buf_ptr; unsigned long long tmpll; - u64 device_size; + u64 uninitialized_var(device_size); if (argc == 1) { /* Use the default keyid */ @@ -896,7 +894,7 @@ static int __init dm_android_verity_init(void) } file = debugfs_create_bool("target_added", S_IRUGO, debug_dir, - (u32 *)&target_added); + &target_added); if (IS_ERR_OR_NULL(file)) { DMERR("Cannot create android_verity debugfs directory: %ld", @@ -906,7 +904,7 @@ static int __init dm_android_verity_init(void) } file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir, - (u32 *)&verity_enabled); + &verity_enabled); if (IS_ERR_OR_NULL(file)) { DMERR("Cannot create android_verity debugfs directory: %ld", diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h index f43b02fbb475..0c7ff6afec69 100644 --- a/drivers/md/dm-android-verity.h +++ b/drivers/md/dm-android-verity.h @@ -113,10 +113,8 @@ extern void dm_linear_dtr(struct dm_target *ti); extern int dm_linear_map(struct dm_target *ti, struct bio *bio); extern void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen); -extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd, - unsigned long arg); -extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct 
bio_vec *biovec, int max_size); +extern int dm_linear_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode); extern int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data); extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 74caca2888a6..8505a771de42 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -116,7 +116,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, } } -static int dm_linear_prepare_ioctl(struct dm_target *ti, +int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct linear_c *lc = (struct linear_c *) ti->private; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 65835f15a116..5214ed2c7507 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -656,7 +656,7 @@ void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct dm_target *ti, +int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) { struct dm_verity *v = ti->private; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index d9cf5e4939eb..75effca400a3 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -128,10 +128,8 @@ extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, extern void verity_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen); -extern int verity_ioctl(struct dm_target *ti, unsigned cmd, - unsigned long arg); -extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size); +extern int verity_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, fmode_t *mode); extern int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data); extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); From 1a0cf8ace145843e5d2af961078e0ee4eccf3588 Mon Sep 17 00:00:00 2001 From: Winter Wang Date: Wed, 27 Jul 2016 10:03:19 +0800 Subject: [PATCH 0245/3220] UPSTREAM: usb: gadget: configfs: add mutex lock before unregister gadget There may be a race condition if f_fs calls unregister_gadget_item in ffs_closed() when unregister_gadget is called by UDC store at the same time. this leads to a kernel NULL pointer dereference: [ 310.644928] Unable to handle kernel NULL pointer dereference at virtual address 00000004 [ 310.645053] init: Service 'adbd' is being killed... 
[ 310.658938] pgd = c9528000 [ 310.662515] [00000004] *pgd=19451831, *pte=00000000, *ppte=00000000 [ 310.669702] Internal error: Oops: 817 [#1] PREEMPT SMP ARM [ 310.675211] Modules linked in: [ 310.678294] CPU: 0 PID: 1537 Comm: ->transport Not tainted 4.1.15-03725-g793404c #2 [ 310.685958] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 310.692493] task: c8e24200 ti: c945e000 task.ti: c945e000 [ 310.697911] PC is at usb_gadget_unregister_driver+0xb4/0xd0 [ 310.703502] LR is at __mutex_lock_slowpath+0x10c/0x16c [ 310.708648] pc : [] lr : [] psr: 600f0113 [ 311.565585] [] (usb_gadget_unregister_driver) from [] (unregister_gadget_item+0x1c/0x34) [ 311.575426] [] (unregister_gadget_item) from [] (ffs_closed+0x8c/0x9c) [ 311.583702] [] (ffs_closed) from [] (ffs_data_reset+0xc/0xa0) [ 311.591194] [] (ffs_data_reset) from [] (ffs_data_closed+0x90/0xd0) [ 311.599208] [] (ffs_data_closed) from [] (ffs_ep0_release+0xc/0x14) [ 311.607224] [] (ffs_ep0_release) from [] (__fput+0x80/0x1d0) [ 311.614635] [] (__fput) from [] (task_work_run+0xb0/0xe8) [ 311.621788] [] (task_work_run) from [] (do_work_pending+0x7c/0xa4) [ 311.629718] [] (do_work_pending) from [] (work_pending+0xc/0x20) for functions using functionFS, i.e. android adbd will close /dev/usb-ffs/adb/ep0 when usb IO thread fails, but switch adb from on to off also triggers write "none" > UDC. These 2 operations both call unregister_gadget, which will lead to the panic above. add a mutex before calling unregister_gadget for api used in f_fs. Signed-off-by: Winter Wang Signed-off-by: Felipe Balbi --- drivers/usb/gadget/configfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index b27ce0747c18..8df96cb3bb58 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1737,7 +1737,9 @@ void unregister_gadget_item(struct config_item *item) { struct gadget_info *gi = to_gadget_info(item); + mutex_lock(&gi->lock); unregister_gadget(gi); + mutex_unlock(&gi->lock); } EXPORT_SYMBOL_GPL(unregister_gadget_item); From 34828bd3e72d5f7c1d921a3d090ab6b57f6e1ca7 Mon Sep 17 00:00:00 2001 From: Ricky Liang Date: Tue, 2 Feb 2016 01:12:06 +0800 Subject: [PATCH 0246/3220] FIXUP: sched: scheduler-driven cpu frequency selection Two fixups that have been reported on LKML. The next version of scheduler-driver cpu frequency selection patch set should include these fixes and we can drop this patch then. 
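The first of the two fixups applies the standard kthread sleep idiom: re-check kthread_should_stop() after setting TASK_INTERRUPTIBLE, so a stop request arriving between the check and schedule() cannot be missed. As a generic sketch (an invented worker, not the governor thread itself):

#include <linux/kthread.h>
#include <linux/sched.h>

static int sketch_worker(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;	/* stop raced in; don't sleep forever */
		schedule();
		__set_current_state(TASK_RUNNING);
		/* ... do one unit of work ... */
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

This works because kthread_stop() wakes the thread: once the task state is set before the second check, a concurrent stop either flips the flag (caught by the check) or delivers a wakeup (caught by schedule() returning).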
Signed-off-by: Ricky Liang Change-Id: Ia2f8b5c0dd5dac06580256eeb4b259929688af68 --- kernel/sched/cpufreq_sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 4fea269a6598..f6f9b9b3a4a8 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -131,6 +131,8 @@ static int cpufreq_sched_thread(void *data) new_request = gd->requested_freq; if (new_request == last_request) { set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; schedule(); } else { /* @@ -293,6 +295,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) goto err; } + policy->governor_data = gd; if (cpufreq_driver_is_slow()) { cpufreq_driver_slow = true; gd->task = kthread_create(cpufreq_sched_thread, policy, @@ -309,12 +312,12 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); } - policy->governor_data = gd; set_sched_freq(); return 0; err: + policy->governor_data = NULL; kfree(gd); return -ENOMEM; } From 76554612b462f53e50f4a40ba80efef90ac7920a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Nov 2015 14:09:38 -0500 Subject: [PATCH 0247/3220] sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats() Part of the responsibility of the update_sg_lb_stats() function is to update the idle_cpus statistical counter in struct sg_lb_stats. This check is done by calling idle_cpu(). The idle_cpu() function, in turn, checks a number of fields within the run queue structure such as rq->curr and rq->nr_running. With the current layout of the run queue structure, rq->curr and rq->nr_running are in separate cachelines. The rq->curr variable is checked first followed by nr_running. As nr_running is also accessed by update_sg_lb_stats() earlier, it makes no sense to load another cacheline when nr_running is not 0 as idle_cpu() will always return false in this case. This patch eliminates this redundant cacheline load by checking the cached nr_running before calling idle_cpu(). 
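The optimization leans on C's short-circuit &&: when the locally cached nr_running is non-zero, idle_cpu() is never called, so its cachelines are never touched. A counting model, with an invented busy/idle pattern:

#include <stdio.h>

static int calls;

/* Stands in for idle_cpu(); counting calls shows the short-circuit. */
static int modeled_idle_cpu(int cpu)
{
	calls++;
	return 1;
}

int main(void)
{
	int idle_cpus = 0;

	for (int cpu = 0; cpu < 8; cpu++) {
		int nr_running = cpu % 2;	/* invented: odd cpus busy */

		if (!nr_running && modeled_idle_cpu(cpu))
			idle_cpus++;
	}
	printf("idle check ran %d times, %d idle\n", calls, idle_cpus);
	return 0;
}

Here the check runs only four times for eight cpus; in the kernel the saving is the avoided load of rq->curr's cacheline on every busy cpu.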
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448478580-26467-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar (cherry picked from commit a426f99c91d1036767a7819aaaba6bd3191b7f06) Signed-off-by: Javi Merino --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da9323b9ecab..4a7d9d78ce6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7287,7 +7287,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload, bool *overutilized) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -7304,7 +7304,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -7312,7 +7313,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; if (cpu_overutilized(i)) { From ca9b7b070b5833b9a7f4282df8a8484e5498630f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 10 Jul 2016 10:04:02 +0200 Subject: [PATCH 0248/3220] UPSTREAM: tcp: make challenge acks less predictable (cherry picked from commit 75ff39ccc1bd5d3c455b6822ab09e533c551f758) Yue Cao claims that current host rate limiting of challenge ACKS (RFC 5961) could leak enough information to allow a patient attacker to hijack TCP sessions. He will soon provide details in an academic paper. This patch increases the default limit from 100 to 1000, and adds some randomization so that the attacker can no longer hijack sessions without spending a considerable amount of probes. Based on initial analysis and patch from Linus. Note that we also have per socket rate limiting, so it is tempting to remove the host limit in the future. v2: randomize the count of challenge acks per second, not the period. Fixes: 282f23c6ee34 ("tcp: implement RFC 5961 3.2") Reported-by: Yue Cao Signed-off-by: Eric Dumazet Suggested-by: Linus Torvalds Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Change-Id: Ib46ba66f5e4a5a7c81bfccd7b0aa83c3d9e1b3bb Bug: 30809774 --- net/ipv4/tcp_input.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da34f830f4bc..2014a701663a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,7 +89,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -3428,7 +3428,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); - u32 now; + u32 count, now; /* First check our per-socket dupack rate limit. 
*/ if (tcp_oow_rate_limited(sock_net(sk), skb, @@ -3436,13 +3436,18 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) &tp->last_oow_ack_time)) return; - /* Then check the check host-wide RFC 5961 rate limit. */ + /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } From a20900c4aa3596f6318568a662c71e0943b2bbd9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 13 Jan 2016 17:48:01 +0100 Subject: [PATCH 0249/3220] UPSTREAM: ALSA: timer: Fix race among timer ioctls (cherry picked from commit af368027a49a751d6ff4ee9e3f9961f35bb4fede) ALSA timer ioctls have an open race and this may lead to a use-after-free of the timer instance object. A simplistic fix is to make each ioctl exclusive. We already have tread_sem for controlling the tread; extend it into a global mutex that is taken around each ioctl. The downside is, of course, the worse concurrency. But these ioctls aren't meant to be accessed in parallel anyway, so it should be fine to serialize them. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Cc: Signed-off-by: Takashi Iwai Change-Id: I1ac52f1cba5e7408fd88c8fc1c30ca2e83967ebb Bug: 28694392 --- sound/core/timer.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/sound/core/timer.c b/sound/core/timer.c index 1701703b8d93..f19fc67413e6 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -73,7 +73,7 @@ struct snd_timer_user { struct timespec tstamp; /* trigger tstamp */ wait_queue_head_t qchange_sleep; struct fasync_struct *fasync; - struct mutex tread_sem; + struct mutex ioctl_lock; }; /* list of timers */ @@ -1265,7 +1265,7 @@ static int snd_timer_user_open(struct inode *inode, struct file *file) return -ENOMEM; spin_lock_init(&tu->qlock); init_waitqueue_head(&tu->qchange_sleep); - mutex_init(&tu->tread_sem); + mutex_init(&tu->ioctl_lock); tu->ticks = 1; tu->queue_size = 128; tu->queue = kmalloc(tu->queue_size * sizeof(struct snd_timer_read), @@ -1285,8 +1285,10 @@ static int snd_timer_user_release(struct inode *inode, struct file *file) if (file->private_data) { tu = file->private_data; file->private_data = NULL; + mutex_lock(&tu->ioctl_lock); if (tu->timeri) snd_timer_close(tu->timeri); + mutex_unlock(&tu->ioctl_lock); kfree(tu->queue); kfree(tu->tqueue); kfree(tu); @@ -1524,7 +1526,6 @@ static int snd_timer_user_tselect(struct file *file, int err = 0; tu = file->private_data; - mutex_lock(&tu->tread_sem); if (tu->timeri) { snd_timer_close(tu->timeri); tu->timeri = NULL; @@ -1568,7 +1569,6 @@ static int snd_timer_user_tselect(struct file *file, } __err: - mutex_unlock(&tu->tread_sem); return err; } @@ -1782,7 +1782,7 @@ enum { SNDRV_TIMER_IOCTL_PAUSE_OLD = _IO('T', 0x23), }; -static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, +static long __snd_timer_user_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_timer_user *tu; @@ -1799,17 +1799,11 @@ static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, { int xarg; - mutex_lock(&tu->tread_sem); - if (tu->timeri) {
/* too late */ - mutex_unlock(&tu->tread_sem); + if (tu->timeri) /* too late */ return -EBUSY; - } - if (get_user(xarg, p)) { - mutex_unlock(&tu->tread_sem); + if (get_user(xarg, p)) return -EFAULT; - } tu->tread = xarg ? 1 : 0; - mutex_unlock(&tu->tread_sem); return 0; } case SNDRV_TIMER_IOCTL_GINFO: @@ -1842,6 +1836,18 @@ static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, return -ENOTTY; } +static long snd_timer_user_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct snd_timer_user *tu = file->private_data; + long ret; + + mutex_lock(&tu->ioctl_lock); + ret = __snd_timer_user_ioctl(file, cmd, arg); + mutex_unlock(&tu->ioctl_lock); + return ret; +} + static int snd_timer_user_fasync(int fd, struct file * file, int on) { struct snd_timer_user *tu; From 50acf9e6e34a14148d6a8c715c79b1cb013f4f3d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 11 Feb 2016 19:37:27 +0000 Subject: [PATCH 0250/3220] UPSTREAM: af_unix: Guard against other == sk in unix_dgram_sendmsg (cherry picked from commit a5527dda344fff0514b7989ef7a755729769daa1) The unix_dgram_sendmsg routine uses the following test if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { to determine if sk and other are in an n:1 association (either established via connect or by using sendto to send messages to an unrelated socket identified by address). This isn't correct: the specified address could have been bound to the sending socket itself, or this socket could have been connected to itself by the time of the unix_peer_get but disconnected before the unix_state_lock(other). In both cases, the if-block would be entered despite other == sk, which might either block the sender unintentionally or lead to trying to unlock the same spin lock twice for a non-blocking send. Add an other != sk check to guard against this. Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue") Reported-By: Philipp Hahn Signed-off-by: Rainer Weikusat Tested-by: Philipp Hahn Signed-off-by: David S. Miller Change-Id: I4ebef6a390df3487903b166b837e34c653e01cb2 Signed-off-by: Amit Pundir --- net/unix/af_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ef05cd9403d4..8badff627fef 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1765,7 +1765,12 @@ restart_locked: goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); From 8d525c512280bb7d8218fd59c04de985b1886eca Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Tue, 16 Aug 2016 15:51:34 -0700 Subject: [PATCH 0251/3220] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default; collection can be enabled by writing 1 into latency_hist, the accumulated stats can be cleared by writing 2, and writing 0 disables collection again.
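As a usage sketch (the sysfs path below assumes the MMC class device and is hypothetical; actual paths vary by host):

	echo 1 > /sys/class/mmc_host/mmc0/latency_hist   # enable collection
	cat /sys/class/mmc_host/mmc0/latency_hist        # dump read/write histograms
	echo 2 > /sys/class/mmc_host/mmc0/latency_hist   # zero the accumulated stats
	echo 0 > /sys/class/mmc_host/mmc0/latency_hist   # disable again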
Bug: 30677035 Change-Id: I625938135ea33e6e87cf6af1fc7edc136d8b4b32 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 80 ++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 ++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 6 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 81 +++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 320 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 33e2f62d5062..f95413fb7f2b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3539,3 +3539,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off by a few errors in this + * code is not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. + */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + int pct, num_elem; + u_int64_t elem; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %d):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %d):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = (elem * 100) / num_elem; + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 9fab52559a8c..b503249950df 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,17 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + 
mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -627,6 +638,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1964,7 +1980,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2907,6 +2923,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcf7829c759e..17068839c74b 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,8 +32,6 @@ #include "slot-gpio.h" #include "pwrseq.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -394,6 +392,8 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif + mmc_latency_hist_sysfs_init(host); + mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,6 +422,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 992bf5397633..bf38533406fd 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,6 +12,8 @@ #define _MMC_CORE_HOST_H #include +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); +void 
mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 85cd2564c157..4167bdbf0ecf 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include #include +#include #include "ufshcd.h" #include "unipro.h" @@ -1332,6 +1333,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3160,6 +3172,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * DOOR_BELL afterward allows us to handle all the completed requests. @@ -3184,6 +3197,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5327,6 +5356,54 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_remove_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5342,6 +5419,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5639,6 +5717,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5653,6 +5733,7 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 2570d9477b37..f3780cf7d895 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,6 +532,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..faf2c0093527 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,6 +197,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1645,6 +1648,79 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); + +/* + * X-axis for IO latency histogram support.
+ */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 37967b6da03c..3349f0676acb 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,6 +136,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 40025b28c1fb..e4862f7cdede 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From 1c6df5fdcb19a03fb6b652090b4b968a6377fb14 Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Thu, 25 Aug 2016 00:59:21 +0000 Subject: [PATCH 0252/3220] Revert "Android: MMC/UFS IO Latency Histograms." This reverts commit 8d525c512280bb7d8218fd59c04de985b1886eca. Change-Id: I69350b98d9de9b1c9f591e03a90f133e328ba72a --- block/blk-core.c | 80 -------------------------------------- drivers/mmc/core/core.c | 66 +------------------------------ drivers/mmc/core/host.c | 6 +-- drivers/mmc/core/host.h | 5 --- drivers/scsi/ufs/ufshcd.c | 81 --------------------------------------- drivers/scsi/ufs/ufshcd.h | 3 -- include/linux/blkdev.h | 76 ------------------------------------ include/linux/mmc/core.h | 2 - include/linux/mmc/host.h | 4 -- 9 files changed, 3 insertions(+), 320 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f95413fb7f2b..33e2f62d5062 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3539,83 +3539,3 @@ int __init blk_dev_init(void) return 0; } - -/* - * Blk IO latency support. We want this to be as cheap as possible, so doing - * this lockless (and avoiding atomics), a few off by a few errors in this - * code is not harmful, and we don't want to do anything that is - * perf-impactful. - * TODO : If necessary, we can make the histograms per-cpu and aggregate - * them when printing them out. 
- */ -void -blk_zero_latency_hist(struct io_latency_state *s) -{ - memset(s->latency_y_axis_read, 0, - sizeof(s->latency_y_axis_read)); - memset(s->latency_y_axis_write, 0, - sizeof(s->latency_y_axis_write)); - s->latency_reads_elems = 0; - s->latency_writes_elems = 0; -} - -ssize_t -blk_latency_hist_show(struct io_latency_state *s, char *buf) -{ - int i; - int bytes_written = 0; - int pct, num_elem; - u_int64_t elem; - - num_elem = s->latency_reads_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Read Latency Histogram (n = %d):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_read[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_read[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - num_elem = s->latency_writes_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Write Latency Histogram (n = %d):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_write[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_write[i]; - pct = (elem * 100) / num_elem; - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - return bytes_written; -} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b503249950df..9fab52559a8c 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,17 +183,6 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); - if (mrq->lat_hist_enabled) { - ktime_t completion; - u_int64_t delta_us; - - completion = ktime_get(); - delta_us = ktime_us_delta(completion, - mrq->io_start); - blk_update_latency_hist(&host->io_lat_s, - (mrq->data->flags & MMC_DATA_READ), - delta_us); - } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -638,11 +627,6 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { - if (host->latency_hist_enabled) { - areq->mrq->io_start = ktime_get(); - areq->mrq->lat_hist_enabled = 1; - } else - areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1980,7 +1964,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2923,54 +2907,6 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } -static ssize_t -latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mmc_host *host = cls_dev_to_mmc_host(dev); - - return blk_latency_hist_show(&host->io_lat_s, buf); -} - -/* - * Values permitted 0, 1, 2. 
- * 0 -> Disable IO latency histograms (default) - * 1 -> Enable IO latency histograms - * 2 -> Zero out IO latency histograms - */ -static ssize_t -latency_hist_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mmc_host *host = cls_dev_to_mmc_host(dev); - long value; - - if (kstrtol(buf, 0, &value)) - return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&host->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || - value == BLK_IO_LAT_HIST_DISABLE) - host->latency_hist_enabled = value; - return count; -} - -static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, - latency_hist_show, latency_hist_store); - -void -mmc_latency_hist_sysfs_init(struct mmc_host *host) -{ - if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) - dev_err(&host->class_dev, - "Failed to create latency_hist sysfs entry\n"); -} - -void -mmc_latency_hist_sysfs_exit(struct mmc_host *host) -{ - device_remove_file(&host->class_dev, &dev_attr_latency_hist); -} - subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 17068839c74b..fcf7829c759e 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,6 +32,8 @@ #include "slot-gpio.h" #include "pwrseq.h" +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -392,8 +394,6 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif - mmc_latency_hist_sysfs_init(host); - mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,8 +422,6 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif - mmc_latency_hist_sysfs_exit(host); - device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index bf38533406fd..992bf5397633 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,8 +12,6 @@ #define _MMC_CORE_HOST_H #include -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -23,8 +21,5 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); -void mmc_latency_hist_sysfs_init(struct mmc_host *host); -void mmc_latency_hist_sysfs_exit(struct mmc_host *host); - #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 4167bdbf0ecf..85cd2564c157 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,7 +39,6 @@ #include #include -#include #include "ufshcd.h" #include "unipro.h" @@ -1333,17 +1332,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } - - /* IO svc time latency histogram */ - if (hba != NULL && cmd->request != NULL) { - if (hba->latency_hist_enabled && - (cmd->request->cmd_type == REQ_TYPE_FS)) { - cmd->request->lat_hist_io_start = ktime_get(); - cmd->request->lat_hist_enabled = 1; - } else - cmd->request->lat_hist_enabled = 0; - } - WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3172,7 +3160,6 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; - struct request *req; /* Resetting interrupt aggregation counters first and reading the * 
DOOR_BELL afterward allows us to handle all the completed requests. @@ -3197,22 +3184,6 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); - req = cmd->request; - if (req) { - /* Update IO svc time latency histogram */ - if (req->lat_hist_enabled) { - ktime_t completion; - u_int64_t delta_us; - - completion = ktime_get(); - delta_us = ktime_us_delta(completion, - req->lat_hist_io_start); - /* rq_data_dir() => true if WRITE */ - blk_update_latency_hist(&hba->io_lat_s, - (rq_data_dir(req) == READ), - delta_us); - } - } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5356,54 +5327,6 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); -/* - * Values permitted 0, 1, 2. - * 0 -> Disable IO latency histograms (default) - * 1 -> Enable IO latency histograms - * 2 -> Zero out IO latency histograms - */ -static ssize_t -latency_hist_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ufs_hba *hba = dev_get_drvdata(dev); - long value; - - if (kstrtol(buf, 0, &value)) - return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&hba->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || - value == BLK_IO_LAT_HIST_DISABLE) - hba->latency_hist_enabled = value; - return count; -} - -ssize_t -latency_hist_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct ufs_hba *hba = dev_get_drvdata(dev); - - return blk_latency_hist_show(&hba->io_lat_s, buf); -} - -static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, - latency_hist_show, latency_hist_store); - -static void -ufshcd_init_latency_hist(struct ufs_hba *hba) -{ - if (device_create_file(hba->dev, &dev_attr_latency_hist)) - dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); -} - -static void -ufshcd_exit_latency_hist(struct ufs_hba *hba) -{ - device_remove_file(hba->dev, &dev_attr_latency_hist); -} - /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5419,7 +5342,6 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); - ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5717,8 +5639,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); - ufshcd_init_latency_hist(hba); - /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5733,7 +5653,6 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); - ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index f3780cf7d895..2570d9477b37 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,9 +532,6 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; - - int latency_hist_enabled; - struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated.
Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index faf2c0093527..c70e3588a48c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,9 +197,6 @@ struct request { /* for bidi */ struct request *next_rq; - - ktime_t lat_hist_io_start; - int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1648,79 +1645,6 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); - -/* - * X-axis for IO latency histogram support. - */ -static const u_int64_t latency_x_axis_us[] = { - 100, - 200, - 300, - 400, - 500, - 600, - 700, - 800, - 900, - 1000, - 1200, - 1400, - 1600, - 1800, - 2000, - 2500, - 3000, - 4000, - 5000, - 6000, - 7000, - 9000, - 10000 -}; - -#define BLK_IO_LAT_HIST_DISABLE 0 -#define BLK_IO_LAT_HIST_ENABLE 1 -#define BLK_IO_LAT_HIST_ZERO 2 - -struct io_latency_state { - u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_reads_elems; - u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_writes_elems; -}; - -static inline void -blk_update_latency_hist(struct io_latency_state *s, - int read, - u_int64_t delta_us) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { - if (delta_us < (u_int64_t)latency_x_axis_us[i]) { - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - break; - } - } - if (i == ARRAY_SIZE(latency_x_axis_us)) { - /* Overflowed the histogram */ - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - } - if (read) - s->latency_reads_elems++; - else - s->latency_writes_elems++; -} - -void blk_zero_latency_hist(struct io_latency_state *s); -ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); - #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 3349f0676acb..37967b6da03c 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,8 +136,6 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; - ktime_t io_start; - int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e4862f7cdede..40025b28c1fb 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -380,9 +379,6 @@ struct mmc_host { } embedded_sdio_data; #endif - int latency_hist_enabled; - struct io_latency_state io_lat_s; - unsigned long private[0] ____cacheline_aligned; }; From 8a58605e4661d18b4e09f1385fe22b1e83cd2cb6 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 23 Aug 2016 11:32:37 -0700 Subject: [PATCH 0253/3220] ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module Exports the device mapper callbacks of linear and dm-verity-target methods. 
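To illustrate what the exports enable, a separately built module can now reference these callbacks directly from its own target_type instead of duplicating them. A hedged sketch of such a consumer (the "example-linear" target is hypothetical, standing in for what dm-android-verity does with the dm-linear and dm-verity callbacks):

	#include <linux/module.h>
	#include <linux/device-mapper.h>

	/* Provided by dm-linear.c via the EXPORT_SYMBOL_GPL()s in this patch. */
	extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv);
	extern void dm_linear_dtr(struct dm_target *ti);
	extern int dm_linear_map(struct dm_target *ti, struct bio *bio);

	static struct target_type example_target = {
		.name    = "example-linear",
		.version = {1, 0, 0},
		.module  = THIS_MODULE,
		.ctr     = dm_linear_ctr,	/* reuse the exported callbacks */
		.dtr     = dm_linear_dtr,
		.map     = dm_linear_map,
	};

	static int __init example_init(void)
	{
		return dm_register_target(&example_target);
	}

	static void __exit example_exit(void)
	{
		dm_unregister_target(&example_target);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");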
Signed-off-by: Badhri Jagan Sridharan Change-Id: I0358be0615c431dce3cc78575aaac4ccfe3aacd7 --- drivers/md/Kconfig | 3 ++- drivers/md/Makefile | 5 +---- drivers/md/dm-linear.c | 6 ++++++ drivers/md/dm-verity-target.c | 7 +++++++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 96b419b544ed..6035794bc1f2 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -501,7 +501,7 @@ config DM_LOG_WRITES If unsure, say N. config DM_ANDROID_VERITY - bool "Android verity target support" + tristate "Android verity target support" depends on DM_VERITY depends on X509_CERTIFICATE_PARSER depends on SYSTEM_TRUSTED_KEYRING @@ -509,6 +509,7 @@ config DM_ANDROID_VERITY depends on KEYS depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE + depends on MD_LINEAR ---help--- This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked diff --git a/drivers/md/Makefile b/drivers/md/Makefile index c8fb00d8cc36..32b5d0a90d60 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o +obj-$(CONFIG_DM_ANDROID_VERITY) += dm-android-verity.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o @@ -68,7 +69,3 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif - -ifeq ($(CONFIG_DM_ANDROID_VERITY),y) -dm-verity-objs += dm-android-verity.o -endif diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 8505a771de42..2ff5f32a4b99 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -66,6 +66,7 @@ int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(lc); return ret; } +EXPORT_SYMBOL_GPL(dm_linear_ctr); void dm_linear_dtr(struct dm_target *ti) { @@ -74,6 +75,7 @@ void dm_linear_dtr(struct dm_target *ti) dm_put_device(ti, lc->dev); kfree(lc); } +EXPORT_SYMBOL_GPL(dm_linear_dtr); static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { @@ -98,6 +100,7 @@ int dm_linear_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +EXPORT_SYMBOL_GPL(dm_linear_map); void dm_linear_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -115,6 +118,7 @@ void dm_linear_status(struct dm_target *ti, status_type_t type, break; } } +EXPORT_SYMBOL_GPL(dm_linear_status); int dm_linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) @@ -132,6 +136,7 @@ int dm_linear_prepare_ioctl(struct dm_target *ti, return 1; return 0; } +EXPORT_SYMBOL_GPL(dm_linear_prepare_ioctl); int dm_linear_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -140,6 +145,7 @@ int dm_linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } +EXPORT_SYMBOL_GPL(dm_linear_iterate_devices); static struct target_type linear_target = { .name = "linear", diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5214ed2c7507..9d3d4b297201 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -592,6 +592,7 @@ int verity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +EXPORT_SYMBOL_GPL(verity_map); /* * Status: V (valid) or C (corruption found) @@ -655,6 +656,7 @@ void verity_status(struct dm_target *ti, 
status_type_t type, break; } } +EXPORT_SYMBOL_GPL(verity_status); int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, fmode_t *mode) @@ -668,6 +670,7 @@ int verity_prepare_ioctl(struct dm_target *ti, return 1; return 0; } +EXPORT_SYMBOL_GPL(verity_prepare_ioctl); int verity_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -676,6 +679,7 @@ int verity_iterate_devices(struct dm_target *ti, return fn(ti, v->data_dev, v->data_start, ti->len, data); } +EXPORT_SYMBOL_GPL(verity_iterate_devices); void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) { @@ -689,6 +693,7 @@ void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, limits->logical_block_size); } +EXPORT_SYMBOL_GPL(verity_io_hints); void verity_dtr(struct dm_target *ti) { @@ -719,6 +724,7 @@ void verity_dtr(struct dm_target *ti) kfree(v); } +EXPORT_SYMBOL_GPL(verity_dtr); static int verity_alloc_zero_digest(struct dm_verity *v) { @@ -1053,6 +1059,7 @@ bad: return r; } +EXPORT_SYMBOL_GPL(verity_ctr); static struct target_type verity_target = { .name = "verity", From 8f5bf76871c346bfc4fed9fcf85eff79948aade2 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 11 Jun 2016 20:32:06 +0200 Subject: [PATCH 0254/3220] ipv6: fix endianness error in icmpv6_err IPv6 ping socket error handler doesn't correctly convert the new 32 bit mtu to host endianness before using. [Cherry-pick of net dcb94b88c09ce82a80e188d49bcffdc83ba215a6] Bug: 29370996 Change-Id: Iea0ca79f16c2a1366d82b3b0a3097093d18da8b7 Cc: Lorenzo Colitti Fixes: 6d0bfe22611602f ("net: ipv6: Add IPv6 support to the ping socket.") Signed-off-by: Hannes Frederic Sowa Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a3ec7a77a1ee..41e5c9520c7d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -98,7 +98,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) - ping_err(skb, offset, info); + ping_err(skb, offset, ntohl(info)); } static int icmpv6_rcv(struct sk_buff *skb); From 75c591ea5787a59e69c8d2b803a7f4f0c29ec34e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Sat, 13 Aug 2016 01:13:38 +0900 Subject: [PATCH 0255/3220] net: ipv6: Fix ping to link-local addresses. ping_v6_sendmsg does not set flowi6_oif in response to sin6_scope_id or sk_bound_dev_if, so it is not possible to use these APIs to ping an IPv6 address on a different interface. Instead, it sets flowi6_iif, which is incorrect but harmless. Stop setting flowi6_iif, and support various ways of setting oif in the same priority order used by udpv6_sendmsg. [Backport of net 5e457896986e16c440c97bb94b9ccd95dd157292] Bug: 29370996 Change-Id: Ibe1b9434c00ed96f1e30acb110734c6570b087b8 Tested: https://android-review.googlesource.com/#/c/254470/ Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- net/ipv6/ping.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 2cfaedf03252..9411c8d770a5 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -84,7 +84,7 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct icmp6hdr user_icmph; int addr_type; struct in6_addr *daddr; - int iif = 0; + int oif = 0; struct flowi6 fl6; int err; int hlimit; @@ -106,25 +106,30 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (u->sin6_family != AF_INET6) { return -EAFNOSUPPORT; } - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != u->sin6_scope_id) { - return -EINVAL; - } daddr = &(u->sin6_addr); - iif = u->sin6_scope_id; + if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) + oif = u->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; } - if (!iif) - iif = sk->sk_bound_dev_if; + if (!oif) + oif = sk->sk_bound_dev_if; + + if (!oif) + oif = np->sticky_pktinfo.ipi6_ifindex; + + if (!oif && ipv6_addr_is_multicast(daddr)) + oif = np->mcast_oif; + else if (!oif) + oif = np->ucast_oif; addr_type = ipv6_addr_type(daddr); - if (__ipv6_addr_needs_scope_id(addr_type) && !iif) - return -EINVAL; - if (addr_type & IPV6_ADDR_MAPPED) + if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || + (addr_type & IPV6_ADDR_MAPPED) || + (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if)) return -EINVAL; /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ @@ -134,17 +139,13 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; + fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sock_i_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -154,11 +155,6 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!np) return -EBADF; - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; - else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; - pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; pfh.icmph.checksum = 0; From 3228c5eb7af2b4cb981706b88ed3c3e81ab8e80a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Aug 2016 08:44:12 -0700 Subject: [PATCH 0256/3220] RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact Currently the percpu-rwsem switches to (global) atomic ops while a writer is waiting, which could be quite a while, and this slows down releasing the readers. This patch cures this problem by ordering the reader-state vs reader-count (see the comments in __percpu_down_read() and percpu_down_write()). This changes a global atomic op into a full memory barrier, which doesn't have the global cacheline contention. This also enables using the percpu-rwsem with rcu_sync disabled in order to bias the implementation differently, reducing the writer latency by adding some cost to readers.
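The calling convention is untouched by this change; what the patch optimizes is the cost balance between the two sides of the API. A minimal usage sketch (names hypothetical), with the reader side reduced to a per-CPU increment on the fast path and the writer absorbing the expensive synchronization:

	static struct percpu_rw_semaphore state_sem;

	static int __init state_setup(void)
	{
		return percpu_init_rwsem(&state_sem);
	}

	static void state_read(void)
	{
		percpu_down_read(&state_sem);	/* fast path: per-CPU increment */
		/* ... read shared state ... */
		percpu_up_read(&state_sem);
	}

	static void state_change(void)
	{
		percpu_down_write(&state_sem);	/* blocks until readers drain */
		/* ... modify shared state ... */
		percpu_up_write(&state_sem);
	}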
Mailing-list-URL: https://lkml.org/lkml/2016/8/9/181 Cc: Paul McKenney Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) [jstultz: Backported to 4.4] Change-Id: I8ea04b4dca2ec36f1c2469eccafde1423490572f Signed-off-by: John Stultz --- include/linux/percpu-rwsem.h | 84 +++++++++-- kernel/locking/percpu-rwsem.c | 265 +++++++++++++++++++--------------- 2 files changed, 225 insertions(+), 124 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c2fa3ecb0dce..146efefde2a1 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -10,30 +10,96 @@ struct percpu_rw_semaphore { struct rcu_sync rss; - unsigned int __percpu *fast_read_ctr; + unsigned int __percpu *read_count; struct rw_semaphore rw_sem; - atomic_t slow_read_ctr; - wait_queue_head_t write_waitq; + wait_queue_head_t writer; + int readers_block; }; -extern void percpu_down_read(struct percpu_rw_semaphore *); -extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); -extern void percpu_up_read(struct percpu_rw_semaphore *); +extern int __percpu_down_read(struct percpu_rw_semaphore *, int); +extern void __percpu_up_read(struct percpu_rw_semaphore *); + +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + might_sleep(); + + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); + + preempt_disable(); + /* + * We are in an RCU-sched read-side critical section, so the writer + * cannot both change sem->state from readers_fast and start checking + * counters while we are here. So if we see !sem->state, we know that + * the writer won't be checking until we're past the preempt_enable() + * and that one the synchronize_sched() is done, the writer will see + * anything we did within this RCU-sched read-size critical section. + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + __percpu_down_read(sem, false); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ +} + +static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) +{ + int ret = 1; + + preempt_disable(); + /* + * Same as in percpu_down_read(). + */ + __this_cpu_inc(*sem->read_count); + if (unlikely(!rcu_sync_is_idle(&sem->rss))) + ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + preempt_enable(); + /* + * The barrier() from preempt_enable() prevents the compiler from + * bleeding the critical section out. + */ + + if (ret) + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); + + return ret; +} + +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) +{ + /* + * The barrier() in preempt_disable() prevents the compiler from + * bleeding the critical section out. + */ + preempt_disable(); + /* + * Same as in percpu_down_read(). 
+ */ + if (likely(rcu_sync_is_idle(&sem->rss))) + __this_cpu_dec(*sem->read_count); + else + __percpu_up_read(sem); /* Unconditional memory barrier */ + preempt_enable(); + + rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); +} extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); + extern void percpu_free_rwsem(struct percpu_rw_semaphore *); -#define percpu_init_rwsem(brw) \ +#define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ - __percpu_init_rwsem(brw, #brw, &rwsem_key); \ + __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) - #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,151 +8,186 @@ #include #include -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) -{ - bool success; - - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); - - return success; -} - -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). 
- */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - - if (likely(update_fast_ctr(brw, +1))) - return; - - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); - -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); - return 1; -} - -void percpu_up_read(struct percpu_rw_semaphore *brw) -{ - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; - - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); -} -EXPORT_SYMBOL_GPL(percpu_up_read); - -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) -{ - unsigned int sum = 0; - int cpu; - - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } - - return sum; -} - -void percpu_down_write(struct percpu_rw_semaphore *brw) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. */ - rcu_sync_enter(&brw->rss); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* A matches D */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). + */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); + + if (try) + return 0; + + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); + + /* + * Avoid lockdep for the down/up_read() we already have them. 
+ */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); + return 1; +} +EXPORT_SYMBOL_GPL(__percpu_down_read); + +void __percpu_up_read(struct percpu_rw_semaphore *sem) +{ + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. + */ + __this_cpu_dec(*sem->read_count); + + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); +} +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) + +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. + */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) +{ + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ + + smp_mb(); /* C matches B */ + + return true; +} + +void percpu_down_write(struct percpu_rw_semaphore *sem) +{ + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + + /* + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. + */ + WRITE_ONCE(sem->readers_block, 1); + + smp_mb(); /* D matches A */ + + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ + + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. */ - rcu_sync_exit(&brw->rss); + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. 
+ */ + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); From 0c3240a1ef2e840aaa17f593326e3642bc857aa7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Aug 2016 18:54:13 +0200 Subject: [PATCH 0257/3220] RFC: FROMLIST: cgroup: avoid synchronize_sched() in __cgroup_procs_write() The current percpu-rwsem read side is entirely free of serializing insns at the cost of having a synchronize_sched() in the write path. The latency of the synchronize_sched() is too high for cgroups. The commit 1ed1328792ff talks about the write path being a fairly cold path but this is not the case for Android which moves task to the foreground cgroup and back around binder IPC calls from foreground processes to background processes, so it is significantly hotter than human initiated operations. Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the problem, hopefully it should not be that slow after another commit 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact"). We could just add rcu_sync_enter() into cgroup_init() but we do not want another synchronize_sched() at boot time, so this patch adds the new helper which doesn't block but currently can only be called before the first use. Cc: Tejun Heo Cc: Paul McKenney Reported-by: John Stultz Reported-by: Dmitry Shmidt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov [jstultz: backported to 4.4] Change-Id: I34aa9c394d3052779b56976693e96d861bd255f2 Mailing-list-URL: https://lkml.org/lkml/2016/8/11/557 Signed-off-by: John Stultz --- include/linux/rcu_sync.h | 1 + kernel/cgroup.c | 6 ++++++ kernel/rcu/sync.c | 12 ++++++++++++ 3 files changed, 19 insertions(+) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index a63a33e6196e..ece7ed9a4a70 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a57b7c86871c..bd541053efc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5346,6 +5346,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..e358313a0d6c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -82,6 +82,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) rsp->gp_type = type; } +/** + * Must be called after rcu_sync_init() and before first use. + * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. 
+ */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + /** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization From e91f1799ff2cc3883907b5f3e141507f9716ff0e Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 10 Aug 2016 15:43:06 -0400 Subject: [PATCH 0258/3220] RFC: FROMLIST: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork cgroup_threadgroup_rwsem is acquired in read mode during process exit and fork. It is also grabbed in write mode during __cgroups_proc_write(). I've recently run into a scenario with lots of memory pressure and OOM and I am beginning to see systemd __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 percpu_down_write+0x114/0x170 __cgroup_procs_write.isra.12+0xb8/0x3c0 cgroup_file_write+0x74/0x1a0 kernfs_fop_write+0x188/0x200 __vfs_write+0x6c/0xe0 vfs_write+0xc0/0x230 SyS_write+0x6c/0x110 system_call+0x38/0xb4 This thread is waiting on the reader of cgroup_threadgroup_rwsem to exit. The reader itself is under memory pressure and has gone into reclaim after fork. There are times the reader also ends up waiting on oom_lock as well. __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 jbd2_log_wait_commit+0xd4/0x180 ext4_evict_inode+0x88/0x5c0 evict+0xf8/0x2a0 dispose_list+0x50/0x80 prune_icache_sb+0x6c/0x90 super_cache_scan+0x190/0x210 shrink_slab.part.15+0x22c/0x4c0 shrink_zone+0x288/0x3c0 do_try_to_free_pages+0x1dc/0x590 try_to_free_pages+0xdc/0x260 __alloc_pages_nodemask+0x72c/0xc90 alloc_pages_current+0xb4/0x1a0 page_table_alloc+0xc0/0x170 __pte_alloc+0x58/0x1f0 copy_page_range+0x4ec/0x950 copy_process.isra.5+0x15a0/0x1870 _do_fork+0xa8/0x4b0 ppc_clone+0x8/0xc In the meanwhile, all processes exiting/forking are blocked almost stalling the system. This patch moves the threadgroup_change_begin from before cgroup_fork() to just before cgroup_canfork(). There is no need to worry about threadgroup changes till the task is actually added to the threadgroup. This avoids having to call reclaim with cgroup_threadgroup_rwsem held. tj: Subject and description edits. Signed-off-by: Balbir Singh Acked-by: Zefan Li Cc: Oleg Nesterov Cc: Andrew Morton Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Tejun Heo [jstultz: Cherry-picked from: git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 568ac888215c7f] Change-Id: Ie8ece84fb613cf6a7b08cea1468473a8df2b9661 Signed-off-by: John Stultz --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7ec6e9939b2c..d6a6da547e41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1370,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1522,6 +1521,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked.
It should be noted the the new process's css_set can be changed @@ -1622,6 +1622,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: @@ -1652,7 +1653,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); From ba52437821164deacf9af90d559f2e3f4888ff28 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 14 Jul 2016 11:38:40 -0400 Subject: [PATCH 0259/3220] BACKPORT: tcp: enable per-socket rate limiting of all 'challenge acks' (cherry picked from commit 083ae308280d13d187512b9babe3454342a7987e) The per-socket rate limit for 'challenge acks' was introduced in the context of limiting ack loops: commit f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") And I think it can be extended to rate limit all 'challenge acks' on a per-socket basis. Since we have the global tcp_challenge_ack_limit, this patch allows for tcp_challenge_ack_limit to be set to a large value and effectively rely on the per-socket limit, or set tcp_challenge_ack_limit to a lower value and still prevents a single connections from consuming the entire challenge ack quota. It further moves in the direction of eliminating the global limit at some point, as Eric Dumazet has suggested. This a follow-up to: Subject: tcp: make challenge acks less predictable Cc: Eric Dumazet Cc: David S. Miller Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Yue Cao Signed-off-by: Jason Baron Signed-off-by: David S. Miller Change-Id: I622d5ae96e9387e775a0196c892d8d0e1a5564a7 Signed-off-by: Amit Pundir --- net/ipv4/tcp_input.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2014a701663a..2bf1110aa2ae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3391,6 +3391,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + u32 *last_oow_ack_time) +{ + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS @@ -3404,21 +3421,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* Data packets without SYNs are not likely part of an ACK loop. */ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) - goto not_rate_limited; + return false; - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); - - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS_BH(net, mib_idx); - return true; /* rate-limited: don't send yet! */ - } - } - - *last_oow_ack_time = tcp_time_stamp; - -not_rate_limited: - return false; /* not rate-limited: go ahead, send dupack now! 
*/ + return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } /* RFC 5961 7 [ACK Throttling] */ @@ -3431,9 +3436,9 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) u32 count, now; /* First check our per-socket dupack rate limit. */ - if (tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDCHALLENGE, - &tp->last_oow_ack_time)) + if (__tcp_oow_rate_limited(sock_net(sk), + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) return; /* Then check host-wide RFC 5961 rate limit. */ From 258a8f519a47a2b3c0a42940182715fc0de690e7 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 29 Aug 2016 17:31:10 -0700 Subject: [PATCH 0260/3220] UPSTREAM: Input: powermate - fix oops with malicious USB descriptors The powermate driver expects at least one valid USB endpoint in its probe function. If given malicious descriptors that specify 0 for the number of endpoints, it will crash. Validate the number of endpoints on the interface before using them. The full report for this issue can be found here: http://seclists.org/bugtraq/2016/Mar/85 BUG: 28242610 Reported-by: Ralf Spenneberg Signed-off-by: Josh Boyer Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I1cb956a35f3bba73324240d5bd0a029f49d3c456 --- drivers/input/misc/powermate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/misc/powermate.c b/drivers/input/misc/powermate.c index 63b539d3daba..84909a12ff36 100644 --- a/drivers/input/misc/powermate.c +++ b/drivers/input/misc/powermate.c @@ -307,6 +307,9 @@ static int powermate_probe(struct usb_interface *intf, const struct usb_device_i int error = -ENOMEM; interface = intf->cur_altsetting; + if (interface->desc.bNumEndpoints < 1) + return -EINVAL; + endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -EIO; From 125d90ca2fd862ecb119a085d95f813a6263d218 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 29 Aug 2016 17:33:52 -0700 Subject: [PATCH 0261/3220] UPSTREAM: USB: cypress_m8: add endpoint sanity check commit c55aee1bf0e6b6feec8b2927b43f7a09a6d5f754 upstream. An attack using missing endpoints exists. 
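As in the other endpoint fixes in this series, the idiom is to validate the descriptors once at probe time and refuse to bind, rather than discovering a missing URB later in open(). A minimal sketch of the pattern (hypothetical driver; only the 4.4-era usb_serial_port fields also used in the diff below are assumed):

	#include <linux/errno.h>
	#include <linux/usb/serial.h>

	static int example_port_probe(struct usb_serial_port *port)
	{
		/*
		 * Forged descriptors that declare too few endpoints leave
		 * these URBs unallocated; fail the probe instead of
		 * dereferencing a NULL pointer on first open().
		 */
		if (!port->interrupt_out_urb || !port->interrupt_in_urb)
			return -ENODEV;
		return 0;
	}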
CVE-2016-3137 BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I1cc7957a5924175d24f12fdc41162ece67c907e5 --- drivers/usb/serial/cypress_m8.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c index 01bf53392819..244acb1299a9 100644 --- a/drivers/usb/serial/cypress_m8.c +++ b/drivers/usb/serial/cypress_m8.c @@ -447,6 +447,11 @@ static int cypress_generic_port_probe(struct usb_serial_port *port) struct usb_serial *serial = port->serial; struct cypress_private *priv; + if (!port->interrupt_out_urb || !port->interrupt_in_urb) { + dev_err(&port->dev, "required endpoint is missing\n"); + return -ENODEV; + } + priv = kzalloc(sizeof(struct cypress_private), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -606,12 +611,6 @@ static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port) cypress_set_termios(tty, port, &priv->tmp_termios); /* setup the port and start reading from the device */ - if (!port->interrupt_in_urb) { - dev_err(&port->dev, "%s - interrupt_in_urb is empty!\n", - __func__); - return -1; - } - usb_fill_int_urb(port->interrupt_in_urb, serial->dev, usb_rcvintpipe(serial->dev, port->interrupt_in_endpointAddress), port->interrupt_in_urb->transfer_buffer, From ee5c7a9739c6ddd0d54385c313b5d6f87652cd63 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:33:55 -0700 Subject: [PATCH 0262/3220] UPSTREAM: USB: mct_u232: add sanity checking in probe commit 4e9a0b05257f29cf4b75f3209243ed71614d062e upstream. An attack using the lack of sanity checking in probe is known. This patch checks for the existence of a second port. CVE-2016-3136 BUG: 28242610 Signed-off-by: Oliver Neukum [johan: add error message ] Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I284ad648c2087c34a098d67e0cc6d948a568413c --- drivers/usb/serial/mct_u232.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c index fd707d6a10e2..89726f702202 100644 --- a/drivers/usb/serial/mct_u232.c +++ b/drivers/usb/serial/mct_u232.c @@ -376,14 +376,21 @@ static void mct_u232_msr_to_state(struct usb_serial_port *port, static int mct_u232_port_probe(struct usb_serial_port *port) { + struct usb_serial *serial = port->serial; struct mct_u232_private *priv; + /* check first to simplify error handling */ + if (!serial->port[1] || !serial->port[1]->interrupt_in_urb) { + dev_err(&port->dev, "expected endpoint missing\n"); + return -ENODEV; + } + priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; /* Use second interrupt-in endpoint for reading. */ - priv->read_urb = port->serial->port[1]->interrupt_in_urb; + priv->read_urb = serial->port[1]->interrupt_in_urb; priv->read_urb->context = port; spin_lock_init(&priv->lock); From ae6790121ceb4608e1a1dec10e285c8d661463d5 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:35:32 -0700 Subject: [PATCH 0263/3220] UPSTREAM: USB: usb_driver_claim_interface: add sanity checking commit 0b818e3956fc1ad976bee791eadcbb3b5fec5bfd upstream. Attacks that trick drivers into passing a NULL pointer to usb_driver_claim_interface() using forged descriptors are known. This thwarts them by sanity checking. 
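In practice the NULL arrives via a sibling-interface lookup on a forged configuration; a hedged sketch of the calling pattern being hardened (hypothetical probe fragment; usb_ifnum_to_if() returns NULL when the numbered interface does not exist):

	#include <linux/usb.h>

	static int example_claim_data_intf(struct usb_driver *drv,
					   struct usb_device *udev, void *priv)
	{
		/* NULL if the forged configuration omits interface 1. */
		struct usb_interface *data = usb_ifnum_to_if(udev, 1);

		/*
		 * Previously the first dereference (&iface->dev) inside
		 * usb_driver_claim_interface() would oops; with this patch
		 * a NULL interface is rejected with -ENODEV instead.
		 */
		return usb_driver_claim_interface(drv, data, priv);
	}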
BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: Ib43ec5edb156985a9db941785a313f6801df092a --- drivers/usb/core/driver.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 56593a9a8726..2057d91d8336 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -502,11 +502,15 @@ static int usb_unbind_interface(struct device *dev) int usb_driver_claim_interface(struct usb_driver *driver, struct usb_interface *iface, void *priv) { - struct device *dev = &iface->dev; + struct device *dev; struct usb_device *udev; int retval = 0; int lpm_disable_error; + if (!iface) + return -ENODEV; + + dev = &iface->dev; if (dev->driver) return -EBUSY; From b24e99b3ae1e3860d374d0ed497a371100533e83 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:37:07 -0700 Subject: [PATCH 0264/3220] UPSTREAM: USB: iowarrior: fix oops with malicious USB descriptors commit 4ec0ef3a82125efc36173062a50624550a900ae0 upstream. The iowarrior driver expects at least one valid endpoint. If given malicious descriptors that specify 0 for the number of endpoints, it will crash in the probe function. Ensure there is at least one endpoint on the interface before using it. The full report of this issue can be found here: http://seclists.org/bugtraq/2016/Mar/87 BUG: 28242610 Reported-by: Ralf Spenneberg Signed-off-by: Josh Boyer Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: If5161c23928e9ef77cb3359cba9b36622b1908df --- drivers/usb/misc/iowarrior.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index c6bfd13f6c92..1950e87b4219 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -787,6 +787,12 @@ static int iowarrior_probe(struct usb_interface *interface, iface_desc = interface->cur_altsetting; dev->product_id = le16_to_cpu(udev->descriptor.idProduct); + if (iface_desc->desc.bNumEndpoints < 1) { + dev_err(&interface->dev, "Invalid number of endpoints\n"); + retval = -EINVAL; + goto error; + } + /* set up the endpoint information */ for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; From 6087158f06e3cc66897cb4571cb673ac9fffb12d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 30 Aug 2016 13:39:02 -0700 Subject: [PATCH 0265/3220] UPSTREAM: USB: cdc-acm: more sanity checking commit 8835ba4a39cf53f705417b3b3a94eb067673f2c9 upstream. An attack has become available which pretends to be a quirky device circumventing normal sanity checks and crashes the kernel by an insufficient number of interfaces. This patch adds a check to the code path for quirky devices. 
BUG: 28242610 Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman Signed-off-by: Badhri Jagan Sridharan Change-Id: I9a5f7f3c704b65e866335054f470451fcfae9d1c --- drivers/usb/class/cdc-acm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 26ca4f910cb0..a7732f80a912 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1113,6 +1113,9 @@ static int acm_probe(struct usb_interface *intf, if (quirks == NO_UNION_NORMAL) { data_interface = usb_ifnum_to_if(usb_dev, 1); control_interface = usb_ifnum_to_if(usb_dev, 0); + /* we would crash */ + if (!data_interface || !control_interface) + return -ENODEV; goto skip_normal_probe; } From 4547c0f93518197834f7fc11c6558b566b7e14c8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 31 Aug 2016 09:52:16 -0700 Subject: [PATCH 0266/3220] ANDROID: rcu_sync: Export rcu_sync_lockdep_assert x86_64:allmodconfig fails to build with the following error. ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined! Introduced by commit 3228c5eb7af2 ("RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact"). The applied upstream version exports the missing symbol, so let's do the same. Change-Id: If4e516715c3415fe8c82090f287174857561550d Fixes: 3228c5eb7af2 ("RFC: FROMLIST: locking/percpu-rwsem: Optimize ...") Signed-off-by: Guenter Roeck --- kernel/rcu/sync.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index e358313a0d6c..b49cf3ac2d47 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -68,6 +68,7 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), "suspicious rcu_sync_is_idle() usage"); } +EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); #endif /** From a49dcf2e745c25a67a76d8a1e9658c16827c9885 Mon Sep 17 00:00:00 2001 From: Yongqin Liu Date: Thu, 1 Sep 2016 22:06:04 +0530 Subject: [PATCH 0267/3220] ANDROID: base-cfg: enable SECCOMP config Enable following seccomp configs CONFIG_SECCOMP=y CONFIG_SECCOMP_FILTER=y Otherwise we will get mediacode error like this on Android N: E /system/bin/mediaextractor: libminijail: prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER): Invalid argument Change-Id: I2477b6a2cfdded5c0ebf6ffbb6150b0e5fe2ba12 Signed-off-by: Yongqin Liu Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 6496bb3961a2..288bab2d858d 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -141,6 +141,8 @@ CONFIG_PROFILING=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y +CONFIG_SECCOMP_FILTER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From af4104512cdb48d5eede620871d6a964a8680253 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 6 Apr 2016 14:06:48 +0100 Subject: [PATCH 0268/3220] UPSTREAM: assoc_array: don't call compare_object() on a node (cherry picked from commit 8d4a2ec1e0b41b0cf9a0c5cd4511da7f8e4f3de2) Changes since V1: fixed the description and added KASan warning. In assoc_array_insert_into_terminal_node(), we call the compare_object() method on all non-empty slots, even when they're not leaves, passing a pointer to an unexpected structure to compare_object(). Currently it causes an out-of-bound read access in keyring_compare_object detected by KASan (see below). 
The issue is easily reproduced with keyutils testsuite. Only call compare_object() when the slot is a leave. KASan warning: ================================================================== BUG: KASAN: slab-out-of-bounds in keyring_compare_object+0x213/0x240 at addr ffff880060a6f838 Read of size 8 by task keyctl/1655 ============================================================================= BUG kmalloc-192 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in assoc_array_insert+0xfd0/0x3a60 age=69 cpu=1 pid=1647 ___slab_alloc+0x563/0x5c0 __slab_alloc+0x51/0x90 kmem_cache_alloc_trace+0x263/0x300 assoc_array_insert+0xfd0/0x3a60 __key_link_begin+0xfc/0x270 key_create_or_update+0x459/0xaf0 SyS_add_key+0x1ba/0x350 entry_SYSCALL_64_fastpath+0x12/0x76 INFO: Slab 0xffffea0001829b80 objects=16 used=8 fp=0xffff880060a6f550 flags=0x3fff8000004080 INFO: Object 0xffff880060a6f740 @offset=5952 fp=0xffff880060a6e5d1 Bytes b4 ffff880060a6f730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f740: d1 e5 a6 60 00 88 ff ff 0e 00 00 00 00 00 00 00 ...`............ Object ffff880060a6f750: 02 cf 8e 60 00 88 ff ff 02 c0 8e 60 00 88 ff ff ...`.......`.... Object ffff880060a6f760: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f770: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f790: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7c0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7d0: 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object ffff880060a6f7f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ CPU: 0 PID: 1655 Comm: keyctl Tainted: G B 4.5.0-rc4-kasan+ #291 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 0000000000000000 000000001b2800b4 ffff880060a179e0 ffffffff81b60491 ffff88006c802900 ffff880060a6f740 ffff880060a17a10 ffffffff815e2969 ffff88006c802900 ffffea0001829b80 ffff880060a6f740 ffff880060a6e650 Call Trace: [] dump_stack+0x85/0xc4 [] print_trailer+0xf9/0x150 [] object_err+0x34/0x40 [] kasan_report_error+0x230/0x550 [] ? keyring_get_key_chunk+0x13e/0x210 [] __asan_report_load_n_noabort+0x5d/0x70 [] ? keyring_compare_object+0x213/0x240 [] keyring_compare_object+0x213/0x240 [] assoc_array_insert+0x86c/0x3a60 [] ? assoc_array_cancel_edit+0x70/0x70 [] ? __key_link_begin+0x20d/0x270 [] __key_link_begin+0xfc/0x270 [] key_create_or_update+0x459/0xaf0 [] ? trace_hardirqs_on+0xd/0x10 [] ? key_type_lookup+0xc0/0xc0 [] ? lookup_user_key+0x13d/0xcd0 [] ? memdup_user+0x53/0x80 [] SyS_add_key+0x1ba/0x350 [] ? key_get_type_from_user.constprop.6+0xa0/0xa0 [] ? retint_user+0x18/0x23 [] ? trace_hardirqs_on_caller+0x3fe/0x580 [] ? 
trace_hardirqs_on_thunk+0x17/0x19 [] entry_SYSCALL_64_fastpath+0x12/0x76 Memory state around the buggy address: ffff880060a6f700: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00 ffff880060a6f780: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc >ffff880060a6f800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff880060a6f880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff880060a6f900: fc fc fc fc fc fc 00 00 00 00 00 00 00 00 00 00 ================================================================== Signed-off-by: Jerome Marchand Signed-off-by: David Howells cc: stable@vger.kernel.org Change-Id: I903935a221a5b9fb14cec14ef64bd2b6fa8eb222 Bug: 30513364 --- lib/assoc_array.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 03dd576e6773..59fd7c0b119c 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -524,7 +524,9 @@ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, free_slot = i; continue; } - if (ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { + if (assoc_array_ptr_is_leaf(ptr) && + ops->compare_object(assoc_array_ptr_to_leaf(ptr), + index_key)) { pr_devel("replace in slot %d\n", i); edit->leaf_p = &node->slots[i]; edit->dead_leaf = node->slots[i]; From 687549af9e5df34f8071c32aced7efae0b46fb58 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 29 Jul 2016 10:40:31 +0200 Subject: [PATCH 0269/3220] UPSTREAM: block: fix use-after-free in seq file (cherry picked from commit 77da160530dd1dc94f6ae15a981f24e5f0021e84) I got a KASAN report of use-after-free: ================================================================== BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508 Read of size 8 by task trinity-c1/315 ============================================================================= BUG kmalloc-32 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315 ___slab_alloc+0x4f1/0x520 __slab_alloc.isra.58+0x56/0x80 kmem_cache_alloc_trace+0x260/0x2a0 disk_seqf_start+0x66/0x110 traverse+0x176/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315 __slab_free+0x17a/0x2c0 kfree+0x20a/0x220 disk_seqf_stop+0x42/0x50 traverse+0x3b5/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G B 4.7.0+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480 ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480 ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970 Call Trace: [] dump_stack+0x65/0x84 [] print_trailer+0x10d/0x1a0 [] object_err+0x2f/0x40 [] kasan_report_error+0x221/0x520 [] __asan_report_load8_noabort+0x3e/0x40 [] klist_iter_exit+0x61/0x70 [] class_dev_iter_exit+0x9/0x10 [] disk_seqf_stop+0x3a/0x50 [] seq_read+0x4b2/0x11a0 [] proc_reg_read+0xbc/0x180 [] 
do_loop_readv_writev+0x134/0x210 [] do_readv_writev+0x565/0x660 [] vfs_readv+0x67/0xa0 [] do_preadv+0x126/0x170 [] SyS_preadv+0xc/0x10 This problem can occur in the following situation: open() - pread() - .seq_start() - iter = kmalloc() // succeeds - seqf->private = iter - .seq_stop() - kfree(seqf->private) - pread() - .seq_start() - iter = kmalloc() // fails - .seq_stop() - class_dev_iter_exit(seqf->private) // boom! old pointer As the comment in disk_seqf_stop() says, stop is called even if start failed, so we need to reinitialise the private pointer to NULL when seq iteration stops. An alternative would be to set the private pointer to NULL when the kmalloc() in disk_seqf_start() fails. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Acked-by: Tejun Heo Signed-off-by: Jens Axboe Change-Id: I07b33f4b38341f60a37806cdd45b0a0c3ab4d84d Bug: 30942273 --- block/genhd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/genhd.c b/block/genhd.c index 817c67c9e45e..82bc52cad1c1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -831,6 +831,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } From c7a407d41ed5d2c6aa0c1892e1d9ebee58d5a928 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 28 Jan 2016 09:22:44 -0200 Subject: [PATCH 0270/3220] UPSTREAM: [media] xc2028: avoid use after free (cherry picked from commit 8dfbcc4351a0b6d2f2d77f367552f48ffefafe18) If struct xc2028_config is passed without a firmware name, the following trouble may happen: [11009.907205] xc2028 5-0061: type set to XCeive xc2028/xc3028 tuner [11009.907491] ================================================================== [11009.907750] BUG: KASAN: use-after-free in strcmp+0x96/0xb0 at addr ffff8803bd78ab40 [11009.907992] Read of size 1 by task modprobe/28992 [11009.907994] ============================================================================= [11009.907997] BUG kmalloc-16 (Tainted: G W ): kasan: bad access detected [11009.907999] ----------------------------------------------------------------------------- [11009.908008] INFO: Allocated in xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] age=0 cpu=3 pid=28992 [11009.908012] ___slab_alloc+0x581/0x5b0 [11009.908014] __slab_alloc+0x51/0x90 [11009.908017] __kmalloc+0x27b/0x350 [11009.908022] xhci_urb_enqueue+0x214/0x14c0 [xhci_hcd] [11009.908026] usb_hcd_submit_urb+0x1e8/0x1c60 [11009.908029] usb_submit_urb+0xb0e/0x1200 [11009.908032] usb_serial_generic_write_start+0xb6/0x4c0 [11009.908035] usb_serial_generic_write+0x92/0xc0 [11009.908039] usb_console_write+0x38a/0x560 [11009.908045] call_console_drivers.constprop.14+0x1ee/0x2c0 [11009.908051] console_unlock+0x40d/0x900 [11009.908056] vprintk_emit+0x4b4/0x830 [11009.908061] vprintk_default+0x1f/0x30 [11009.908064] printk+0x99/0xb5 [11009.908067] kasan_report_error+0x10a/0x550 [11009.908070] __asan_report_load1_noabort+0x43/0x50 [11009.908074] INFO: Freed in xc2028_set_config+0x90/0x630 [tuner_xc2028] age=1 cpu=3 pid=28992 [11009.908077] __slab_free+0x2ec/0x460 [11009.908080] kfree+0x266/0x280 [11009.908083] xc2028_set_config+0x90/0x630 [tuner_xc2028] [11009.908086] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908090] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908094] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908098] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908101] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908105] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] 
[11009.908108] do_one_initcall+0x141/0x300 [11009.908111] do_init_module+0x1d0/0x5ad [11009.908114] load_module+0x6666/0x9ba0 [11009.908117] SyS_finit_module+0x108/0x130 [11009.908120] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908123] INFO: Slab 0xffffea000ef5e280 objects=25 used=25 fp=0x (null) flags=0x2ffff8000004080 [11009.908126] INFO: Object 0xffff8803bd78ab40 @offset=2880 fp=0x0000000000000001 [11009.908130] Bytes b4 ffff8803bd78ab30: 01 00 00 00 2a 07 00 00 9d 28 00 00 01 00 00 00 ....*....(...... [11009.908133] Object ffff8803bd78ab40: 01 00 00 00 00 00 00 00 b0 1d c3 6a 00 88 ff ff ...........j.... [11009.908137] CPU: 3 PID: 28992 Comm: modprobe Tainted: G B W 4.5.0-rc1+ #43 [11009.908140] Hardware name: /NUC5i7RYB, BIOS RYBDWi35.86A.0350.2015.0812.1722 08/12/2015 [11009.908142] ffff8803bd78a000 ffff8802c273f1b8 ffffffff81932007 ffff8803c6407a80 [11009.908148] ffff8802c273f1e8 ffffffff81556759 ffff8803c6407a80 ffffea000ef5e280 [11009.908153] ffff8803bd78ab40 dffffc0000000000 ffff8802c273f210 ffffffff8155ccb4 [11009.908158] Call Trace: [11009.908162] [] dump_stack+0x4b/0x64 [11009.908165] [] print_trailer+0xf9/0x150 [11009.908168] [] object_err+0x34/0x40 [11009.908171] [] kasan_report_error+0x230/0x550 [11009.908175] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908179] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908182] [] __asan_report_load1_noabort+0x43/0x50 [11009.908185] [] ? __asan_register_globals+0x50/0xa0 [11009.908189] [] ? strcmp+0x96/0xb0 [11009.908192] [] strcmp+0x96/0xb0 [11009.908196] [] xc2028_set_config+0x15c/0x630 [tuner_xc2028] [11009.908200] [] xc2028_attach+0x310/0x8a0 [tuner_xc2028] [11009.908203] [] ? memset+0x28/0x30 [11009.908206] [] ? xc2028_set_config+0x630/0x630 [tuner_xc2028] [11009.908211] [] em28xx_attach_xc3028.constprop.7+0x1f9/0x30d [em28xx_dvb] [11009.908215] [] ? em28xx_dvb_init.part.3+0x37c/0x5cf4 [em28xx_dvb] [11009.908219] [] ? hauppauge_hvr930c_init+0x487/0x487 [em28xx_dvb] [11009.908222] [] ? lgdt330x_attach+0x1cc/0x370 [lgdt330x] [11009.908226] [] ? i2c_read_demod_bytes.isra.2+0x210/0x210 [lgdt330x] [11009.908230] [] ? ref_module.part.15+0x10/0x10 [11009.908233] [] ? module_assert_mutex_or_preempt+0x80/0x80 [11009.908238] [] em28xx_dvb_init.part.3+0x8e4/0x5cf4 [em28xx_dvb] [11009.908242] [] ? em28xx_attach_xc3028.constprop.7+0x30d/0x30d [em28xx_dvb] [11009.908245] [] ? string+0x14d/0x1f0 [11009.908249] [] ? symbol_string+0xff/0x1a0 [11009.908253] [] ? uuid_string+0x6f0/0x6f0 [11009.908257] [] ? __kernel_text_address+0x7e/0xa0 [11009.908260] [] ? print_context_stack+0x7f/0xf0 [11009.908264] [] ? __module_address+0xb6/0x360 [11009.908268] [] ? is_ftrace_trampoline+0x99/0xe0 [11009.908271] [] ? __kernel_text_address+0x7e/0xa0 [11009.908275] [] ? debug_check_no_locks_freed+0x290/0x290 [11009.908278] [] ? dump_trace+0x11b/0x300 [11009.908282] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908285] [] ? trace_hardirqs_off_caller+0x21/0x290 [11009.908289] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908292] [] ? trace_hardirqs_on+0xd/0x10 [11009.908296] [] ? em28xx_register_extension+0x23/0x190 [em28xx] [11009.908299] [] ? mutex_trylock+0x400/0x400 [11009.908302] [] ? do_one_initcall+0x131/0x300 [11009.908306] [] ? call_rcu_sched+0x17/0x20 [11009.908309] [] ? put_object+0x48/0x70 [11009.908314] [] em28xx_dvb_init+0x81/0x8a [em28xx_dvb] [11009.908317] [] em28xx_register_extension+0xd9/0x190 [em28xx] [11009.908320] [] ? 
0xffffffffa0150000 [11009.908324] [] em28xx_dvb_register+0x10/0x1000 [em28xx_dvb] [11009.908327] [] do_one_initcall+0x141/0x300 [11009.908330] [] ? try_to_run_init_process+0x40/0x40 [11009.908333] [] ? trace_hardirqs_on_caller+0x16/0x590 [11009.908337] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908340] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908343] [] ? kasan_unpoison_shadow+0x36/0x50 [11009.908346] [] ? __asan_register_globals+0x87/0xa0 [11009.908350] [] do_init_module+0x1d0/0x5ad [11009.908353] [] load_module+0x6666/0x9ba0 [11009.908356] [] ? symbol_put_addr+0x50/0x50 [11009.908361] [] ? em28xx_dvb_init.part.3+0x5989/0x5cf4 [em28xx_dvb] [11009.908366] [] ? module_frob_arch_sections+0x20/0x20 [11009.908369] [] ? open_exec+0x50/0x50 [11009.908374] [] ? ns_capable+0x5b/0xd0 [11009.908377] [] SyS_finit_module+0x108/0x130 [11009.908379] [] ? SyS_init_module+0x1f0/0x1f0 [11009.908383] [] ? lockdep_sys_exit_thunk+0x12/0x14 [11009.908394] [] entry_SYSCALL_64_fastpath+0x16/0x76 [11009.908396] Memory state around the buggy address: [11009.908398] ffff8803bd78aa00: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908401] ffff8803bd78aa80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908403] >ffff8803bd78ab00: fc fc fc fc fc fc fc fc 00 00 fc fc fc fc fc fc [11009.908405] ^ [11009.908407] ffff8803bd78ab80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908409] ffff8803bd78ac00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [11009.908411] ================================================================== In order to avoid it, let's set the cached value of the firmware name to NULL after freeing it. While here, return an error if the memory allocation fails. Signed-off-by: Mauro Carvalho Chehab Change-Id: I945c841dcfb45de2056267e4aa50bbe176b527cf Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 4e941f00b600..082ff5608455 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1403,11 +1403,12 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) * in order to avoid troubles during device release. */ kfree(priv->ctrl.fname); + priv->ctrl.fname = NULL; memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); if (priv->ctrl.fname == NULL) - rc = -ENOMEM; + return -ENOMEM; } /* From 30439e51fc6f9ab169f7cb9e8e88faf7fb27ae99 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Feb 2016 13:34:00 -0200 Subject: [PATCH 0271/3220] UPSTREAM: [media] xc2028: unlock on error in xc2028_set_config() (cherry picked from commit 210bd104c6acd31c3c6b8b075b3f12d4a9f6b60d) We have to unlock before returning -ENOMEM. 
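The fix below restores the usual goto-unlock idiom, so every failure path drops the mutex before returning; the general shape, as a self-contained sketch (hypothetical structure and function names, not the xc2028 code itself):

	#include <linux/errno.h>
	#include <linux/mutex.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	struct cfg_priv {
		struct mutex lock;
		char *fname;
	};

	static int cfg_set_fname(struct cfg_priv *priv, const char *fname)
	{
		int rc = 0;

		mutex_lock(&priv->lock);
		kfree(priv->fname);
		priv->fname = kstrdup(fname, GFP_KERNEL);
		if (!priv->fname) {
			rc = -ENOMEM;
			goto unlock;	/* never return with the lock held */
		}
		/* further updates, still under the lock */
	unlock:
		mutex_unlock(&priv->lock);
		return rc;
	}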
Fixes: 8dfbcc4351a0 ('[media] xc2028: avoid use after free') Signed-off-by: Dan Carpenter Signed-off-by: Mauro Carvalho Chehab Change-Id: I7b6ba9fde5c6e29467e6de23d398af2fe56e2547 Bug: 30946097 --- drivers/media/tuners/tuner-xc2028.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/media/tuners/tuner-xc2028.c b/drivers/media/tuners/tuner-xc2028.c index 082ff5608455..317ef63ee789 100644 --- a/drivers/media/tuners/tuner-xc2028.c +++ b/drivers/media/tuners/tuner-xc2028.c @@ -1407,8 +1407,10 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) memcpy(&priv->ctrl, p, sizeof(priv->ctrl)); if (p->fname) { priv->ctrl.fname = kstrdup(p->fname, GFP_KERNEL); - if (priv->ctrl.fname == NULL) - return -ENOMEM; + if (priv->ctrl.fname == NULL) { + rc = -ENOMEM; + goto unlock; + } } /* @@ -1440,6 +1442,7 @@ static int xc2028_set_config(struct dvb_frontend *fe, void *priv_cfg) } else priv->state = XC2028_WAITING_FIRMWARE; } +unlock: mutex_unlock(&priv->lock); return rc; From a4e59955acf0ae25d805e0a0d251ac8f4b1808e7 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 5 May 2016 16:22:26 -0700 Subject: [PATCH 0272/3220] UPSTREAM: proc: prevent accessing /proc//environ until it's ready (cherry picked from commit 8148a73c9901a8794a50f950083c00ccf97d43b3) If /proc//environ gets read before the envp[] array is fully set up in create_{aout,elf,elf_fdpic,flat}_tables(), we might end up trying to read more bytes than are actually written, as env_start will already be set but env_end will still be zero, making the range calculation underflow, allowing to read beyond the end of what has been written. Fix this as it is done for /proc//cmdline by testing env_end for zero. It is, apparently, intentionally set last in create_*_tables(). This bug was found by the PaX size_overflow plugin that detected the arithmetic underflow of 'this_len = env_end - (env_start + src)' when env_end is still zero. The expected consequence is that userland trying to access /proc//environ of a not yet fully set up process may get inconsistent data as we're in the middle of copying in the environment variables. Fixes: https://forums.grsecurity.net/viewtopic.php?f=3&t=4363 Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=116461 Signed-off-by: Mathias Krause Cc: Emese Revfy Cc: Pax Team Cc: Al Viro Cc: Mateusz Guzik Cc: Alexey Dobriyan Cc: Cyrill Gorcunov Cc: Jarod Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: Ia2f58d48c15478ed4b6e237b63e704c70ff21e96 Bug: 30951939 --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 49348349e156..df715a095328 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -954,7 +954,8 @@ static ssize_t environ_read(struct file *file, char __user *buf, int ret = 0; struct mm_struct *mm = file->private_data; - if (!mm) + /* Ensure the process spawned far enough to have an environment. */ + if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_TEMPORARY); From 8e975e71f5123f4d2567a98f01641f7c02f8d2a2 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 2 Sep 2016 10:13:21 +0530 Subject: [PATCH 0273/3220] ANDROID: base-cfg: drop SECCOMP_FILTER config Don't need to set SECCOMP_FILTER explicitly since CONFIG_SECCOMP=y will select that config anyway. 
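Strictly speaking SECCOMP_FILTER is defaulted on rather than selected: the 4.4-era arch/Kconfig declares it roughly as below (abridged, exact wording assumed), so it enables itself whenever CONFIG_SECCOMP=y and the arch/network dependencies are already met:

	config SECCOMP_FILTER
		def_bool y
		depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET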
Fixes: a49dcf2e745c ("ANDROID: base-cfg: enable SECCOMP config") Change-Id: Iff18ed4d2db5a55b9f9480d5ecbeef7b818b3837 Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 288bab2d858d..34fa64a669ac 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -142,7 +142,6 @@ CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y -CONFIG_SECCOMP_FILTER=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y From d25e3081baa6f22564c1d87b7650c07c8c0b12b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Aug 2016 05:56:26 -0700 Subject: [PATCH 0274/3220] UPSTREAM: tcp: fix use after free in tcp_xmit_retransmit_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit bb1fceca22492109be12640d49f5ea5a544c6bb4) When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the tail of the write queue using tcp_add_write_queue_tail() Then it attempts to copy user data into this fresh skb. If the copy fails, we undo the work and remove the fresh skb. Unfortunately, this undo lacks the change done to tp->highest_sack and we can leave a dangling pointer (to a freed skb) Later, tcp_xmit_retransmit_queue() can dereference this pointer and access freed memory. For regular kernels where memory is not unmapped, this might cause SACK bugs because tcp_highest_sack_seq() is buggy, returning garbage instead of tp->snd_nxt, but with various debug features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel. This bug was found by Marco Grassi thanks to syzkaller. Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to next skb") Reported-by: Marco Grassi Signed-off-by: Eric Dumazet Cc: Ilpo Järvinen Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Reviewed-by: Cong Wang Signed-off-by: David S. 
Miller Change-Id: I58bb02d6e4e399612e8580b9e02d11e661df82f5 Bug: 31183296 --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index bcce99a951f8..30d1194ff1c5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1513,6 +1513,8 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli { if (sk->sk_send_head == skb_unlinked) sk->sk_send_head = NULL; + if (tcp_sk(sk)->highest_sack == skb_unlinked) + tcp_sk(sk)->highest_sack = NULL; } static inline void tcp_init_send_head(struct sock *sk) From 87518a45ae570ada6a1e404331a6cc910014e1a9 Mon Sep 17 00:00:00 2001 From: Mohamad Ayyash Date: Wed, 11 May 2016 13:18:35 -0700 Subject: [PATCH 0275/3220] BACKPORT: Don't show empty tag stats for unprivileged uids BUG: 27577101 BUG: 27532522 Signed-off-by: Mohamad Ayyash --- net/netfilter/xt_qtaguid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index e2e7d54f9bb1..3bf0c59dab2f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1946,7 +1946,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) ); f_count = atomic_long_read( &sock_tag_entry->socket->file->f_count); - seq_printf(m, "sock=%p tag=0x%llx (uid=%u) pid=%u " + seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " "f_count=%lu\n", sock_tag_entry->sk, sock_tag_entry->tag, uid, @@ -2548,8 +2548,7 @@ static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry, uid_t stat_uid = get_uid_from_tag(tag); struct proc_print_info *ppi = m->private; /* Detailed tags are not available to everybody */ - if (get_atag_from_tag(tag) && !can_read_other_uid_stats( - make_kuid(&init_user_ns,stat_uid))) { + if (!can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid))) { CT_DEBUG("qtaguid: stats line: " "%s 0x%llx %u: insufficient priv " "from pid=%u tgid=%u uid=%u stats.gid=%u\n", From 8ac959cc06c8715cdc64aa88993b3a8b1ef61eaa Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:57:27 -0800 Subject: [PATCH 0276/3220] UPSTREAM: Add 'unsafe' user access functions for batched accesses The naming is meant to discourage random use: the helper functions are not really any more "unsafe" than the traditional double-underscore functions (which need the address range checking), but they do need even more infrastructure around them, and should not be used willy-nilly. In addition to checking the access range, these user access functions require that you wrap the user access with a "user_acess_{begin,end}()" around it. That allows architectures that implement kernel user access control (x86: SMAP, arm64: PAN) to do the user access control in the wrapping user_access_begin/end part, and then batch up the actual user space accesses using the new interfaces. The main (and hopefully only) use for these are for core generic access helpers, initially just the generic user string functions (strnlen_user() and strncpy_from_user()). 
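The intended calling convention, as a minimal sketch (hypothetical caller; note that at this point in the series the accessors still return an error value; the jump-label interface arrives in a later patch):

	#include <linux/errno.h>
	#include <linux/uaccess.h>

	static long read_two_words(const unsigned long __user *p,
				   unsigned long *out)
	{
		unsigned long a, b;

		/* The address range check is still the caller's job. */
		if (!access_ok(VERIFY_READ, p, 2 * sizeof(*p)))
			return -EFAULT;

		user_access_begin();		/* e.g. STAC on x86/SMAP */
		if (unsafe_get_user(a, p) ||
		    unsafe_get_user(b, p + 1)) {
			user_access_end();
			return -EFAULT;
		}
		user_access_end();		/* e.g. CLAC on x86/SMAP */

		*out = a + b;
		return 0;
	}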
Signed-off-by: Linus Torvalds Change-Id: Ic64efea41f97171bdbdabe3e531489aebd9b6fac (cherry picked from commit 5b24a7a2aa2040c8c50c3b71122901d01661ff78) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 25 +++++++++++++++++++++++++ include/linux/uaccess.h | 7 +++++++ 2 files changed, 32 insertions(+) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 09b1b0ab94b7..6c4c39b137e5 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -745,5 +745,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #undef __copy_from_user_overflow #undef __copy_to_user_overflow +/* + * The "unsafe" user accesses aren't really "unsafe", but the naming + * is a big fat warning: you have to not only do the access_ok() + * checking before using them, but you have to surround them with the + * user_access_begin/end() pair. + */ +#define user_access_begin() __uaccess_begin() +#define user_access_end() __uaccess_end() + +#define unsafe_put_user(x, ptr) \ +({ \ + int __pu_err; \ + __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ + __builtin_expect(__pu_err, 0); \ +}) + +#define unsafe_get_user(x, ptr) \ +({ \ + int __gu_err; \ + unsigned long __gu_val; \ + __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __builtin_expect(__gu_err, 0); \ +}) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 558129af828a..349557825428 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -111,4 +111,11 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #define probe_kernel_address(addr, retval) \ probe_kernel_read(&retval, addr, sizeof(retval)) +#ifndef user_access_begin +#define user_access_begin() do { } while (0) +#define user_access_end() do { } while (0) +#define unsafe_get_user(x, ptr) __get_user(x, ptr) +#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#endif + #endif /* __LINUX_UACCESS_H__ */ From c1932b10476e4d60f314bc6111101d1d101599b3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 10:05:19 -0800 Subject: [PATCH 0277/3220] UPSTREAM: Use the new batched user accesses in generic user string handling This converts the generic user string functions to use the batched user access functions. It makes a big difference on Skylake, which is the first x86 microarchitecture to implement SMAP. The STAC/CLAC instructions are not very fast, and doing them for each access inside the loop that copies strings from user space (which is what the pathname handling does for every pathname the kernel uses, for example) is very inefficient. 
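Schematically, the win is hoisting the single STAC/CLAC pair out of the per-word loop; a hedged sketch of the shape (hypothetical helper, not the exact lib/strnlen_user.c code; the caller is assumed to have validated the range with access_ok()):

	#include <linux/uaccess.h>

	/* Count how many words are readable at p, using one access window. */
	static long count_readable_words(const unsigned long __user *p,
					 long max)
	{
		unsigned long c;
		long res = 0;

		user_access_begin();	/* one STAC for the whole loop */
		while (res < max) {
			if (unsafe_get_user(c, p + res))
				break;	/* fault: stop counting */
			res++;
		}
		user_access_end();	/* one CLAC */
		return res;
	}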
Signed-off-by: Linus Torvalds Change-Id: Ic39a686b4bb1ad9cd16ad0887bb669beed6fe8aa (cherry picked from commit 9fd4470ff4974c41b1db43c3b355b9085af9c12a) Signed-off-by: Sami Tolvanen --- lib/strncpy_from_user.c | 11 ++++++++--- lib/strnlen_user.c | 18 ++++++++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index e0af6ff73d14..33840324138c 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,7 +39,7 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) break; *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { @@ -55,7 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(__get_user(c,src+res))) + if (unlikely(unsafe_get_user(c,src+res))) return -EFAULT; dst[res] = c; if (!c) @@ -107,7 +107,12 @@ long strncpy_from_user(char *dst, const char __user *src, long count) src_addr = (unsigned long)src; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strncpy_from_user(dst, src, count, max); + long retval; + + user_access_begin(); + retval = do_strncpy_from_user(dst, src, count, max); + user_access_end(); + return retval; } return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 3a5f2b366d84..2625943625d7 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,7 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(__get_user(c,(unsigned long __user *)src))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) return 0; c |= aligned_byte_mask(align); @@ -61,7 +61,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) return 0; } res -= align; @@ -112,7 +112,12 @@ long strnlen_user(const char __user *str, long count) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, count, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, count, max); + user_access_end(); + return retval; } return 0; } @@ -141,7 +146,12 @@ long strlen_user(const char __user *str) src_addr = (unsigned long)str; if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; - return do_strnlen_user(str, ~0ul, max); + long retval; + + user_access_begin(); + retval = do_strnlen_user(str, ~0ul, max); + user_access_end(); + return retval; } return 0; } From fed752db02277e34e92262daac24ad4216e79c56 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 21:28:47 +0100 Subject: [PATCH 0278/3220] BACKPORT: ARM: 8583/1: mm: fix location of _etext The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. Just to be conservative, leave the kernel resource as it was, using __init_begin instead of _etext as the end mark. 
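The intended ordering, as a toy linker-script sketch mirroring what this and the following arm64 patch establish (illustrative only, not the literal vmlinux.lds.S):

	SECTIONS
	{
		.text : { *(.text) }		/* executable code */
		_etext = .;			/* ends the text: code only, no rodata */
		.rodata : { *(.rodata) }	/* read-only, non-executable data */
		__init_begin = .;		/* kernel_code resource now ends at __init_begin - 1 */
	}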
Signed-off-by: Kees Cook Signed-off-by: Russell King Change-Id: Ida514d1359dbe6f782f562ce29b4ba09ae72bfc0 (cherry picked from commit 14c4a533e0996f95a0a64dfd0b6252d788cebc74) Signed-off-by: Sami Tolvanen --- arch/arm/kernel/setup.c | 2 +- arch/arm/kernel/vmlinux.lds.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 20edd349d379..bf63b4693457 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -772,7 +772,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 8b60fde5ce48..be2ab6d3b91f 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -120,6 +120,8 @@ SECTIONS #ifdef CONFIG_DEBUG_RODATA . = ALIGN(1< Date: Thu, 23 Jun 2016 15:53:17 +0200 Subject: [PATCH 0279/3220] BACKPORT: arm64: mm: fix location of _etext As Kees Cook notes in the ARM counterpart of this patch [0]: The _etext position is defined to be the end of the kernel text code, and should not include any part of the data segments. This interferes with things that might check memory ranges and expect executable code up to _etext. In particular, Kees is referring to the HARDENED_USERCOPY patch set [1], which rejects attempts to call copy_to_user() on kernel ranges containing executable code, but does allow access to the .rodata segment. Regardless of whether one may or may not agree with the distinction, it makes sense for _etext to have the same meaning across architectures. So let's put _etext where it belongs, between .text and .rodata, and fix up existing references to use __init_begin instead, which unlike _end_rodata includes the exception and notes sections as well. The _etext references in kaslr.c are left untouched, since its references to [_stext, _etext) are meant to capture potential jump instruction targets, and so disregarding .rodata is actually an improvement here. 
[0] http://article.gmane.org/gmane.linux.kernel/2245084 [1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502 Reported-by: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Reviewed-by: Kees Cook Signed-off-by: Catalin Marinas Change-Id: I8f6582525217b9ca324f6a382ea52d30ce1d0dbd (cherry picked from commit 9fdc14c55cd6579d619ccd9d40982e0805e62b6d) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/setup.c | 2 +- arch/arm64/kernel/vmlinux.lds.S | 3 ++- arch/arm64/mm/mmu.c | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 8119479147db..7f0e7226795e 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -201,7 +201,7 @@ static void __init request_standard_resources(void) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 71426a78db12..f472809a7b45 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -114,11 +114,12 @@ SECTIONS } ALIGN_DEBUG_RO + _etext = .; /* End of text section */ + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES ALIGN_DEBUG_RO - _etext = .; /* End of text and rodata section */ ALIGN_DEBUG_RO_MIN(PAGE_SIZE) __init_begin = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 873e363048c6..9a8dc4a79d8b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -337,7 +337,6 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end) end - kernel_x_end, PAGE_KERNEL); } - } #else static void __init __map_memblock(phys_addr_t start, phys_addr_t end) @@ -425,7 +424,7 @@ static void __init fixup_executable(void) void mark_rodata_ro(void) { create_mapping_late(__pa(_stext), (unsigned long)_stext, - (unsigned long)_etext - (unsigned long)_stext, + (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_ROX); } From 5f1b3400f0ccc2368dc444b5d9a7c93be2e700da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Aug 2016 13:02:01 -0700 Subject: [PATCH 0280/3220] UPSTREAM: unsafe_[get|put]_user: change interface to use a error target label When I initially added the unsafe_[get|put]_user() helpers in commit 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses"), I made the mistake of modeling the interface on our traditional __[get|put]_user() functions, which return zero on success, or -EFAULT on failure. That interface is fairly easy to use, but it's actually fairly nasty for good code generation, since it essentially forces the caller to check the error value for each access. In particular, since the error handling is already internally implemented with an exception handler, and we already use "asm goto" for various other things, we could fairly easily make the error cases just jump directly to an error label instead, and avoid the need for explicit checking after each operation. So switch the interface to pass in an error label, rather than checking the error value in the caller. Best do it now before we start growing more users (the signal handling code in particular would be a good place to use the new interface). So rather than if (unsafe_get_user(x, ptr)) ... handle error .. 
the interface is now unsafe_get_user(x, ptr, label); where an error during the user mode fetch will now just cause a jump to 'label' in the caller. Right now the actual _implementation_ of this all still ends up being a "if (err) goto label", and does not take advantage of any exception label tricks, but for "unsafe_put_user()" in particular it should be fairly straightforward to convert to using the exception table model. Note that "unsafe_get_user()" is much harder to convert to a clever exception table model, because current versions of gcc do not allow the use of "asm goto" (for the exception) with output values (for the actual value to be fetched). But that is hopefully not a limitation in the long term. [ Also note that it might be a good idea to switch unsafe_get_user() to actually _return_ the value it fetches from user space, but this commit only changes the error handling semantics ] Signed-off-by: Linus Torvalds Change-Id: Ib905a84a04d46984320f6fd1056da4d72f3d6b53 (cherry picked from commit 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 16 ++++++++-------- include/linux/uaccess.h | 4 ++-- lib/strncpy_from_user.c | 8 ++++---- lib/strnlen_user.c | 7 +++---- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 6c4c39b137e5..5abdd0221fb4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -754,21 +754,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 349557825428..f30c187ed785 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr) __get_user(x, ptr) -#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) +#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */ diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33840324138c..5a003a2ebd96 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -39,8 +39,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long 
*)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -55,8 +55,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -75,6 +74,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 2625943625d7..8e105ed4df12 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } From 1e701cdc5bbce1f5561ee25ff8709a18dc0b2282 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 19 Jul 2016 15:00:04 -0700 Subject: [PATCH 0281/3220] UPSTREAM: mm: Add is_migrate_cma_page Code such as hardened user copy[1] needs a way to tell if a page is CMA or not. Add is_migrate_cma_page in a similar way to is_migrate_isolate_page. [1]http://article.gmane.org/gmane.linux.kernel.mm/155238 Signed-off-by: Laura Abbott Signed-off-by: Kees Cook Change-Id: I1f9aa13d8d063038fa70b93282a836648fbb4f6d (cherry picked from commit 7c15d9bb8231f998ae7dc0b72415f5215459f7fb) Signed-off-by: Sami Tolvanen --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e23a9e704536..bab4053fb795 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -65,8 +65,10 @@ enum { #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) +# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false +# define is_migrate_cma_page(_page) false #endif #define for_each_migratetype_order(order, type) \ From c30d7340ee0b167a45cd8d6d5c48add8f62db9a5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jul 2016 16:19:48 -0700 Subject: [PATCH 0282/3220] BACKPORT: mm: Implement stack frame object validation This creates per-architecture function arch_within_stack_frames() that should validate if a given object is contained by a kernel stack frame. Initial implementation is on x86. This is based on code from PaX. 
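Before the implementation below, a sketch of how a caller is meant to consume the three-way result; this mirrors what mm/usercopy.c does later in this series, though object_ok_on_stack() itself is a hypothetical name:

static bool object_ok_on_stack(const void *obj, unsigned long len)
{
	const void *stack = task_stack_page(current);
	const void *stackend = stack + THREAD_SIZE;

	switch (arch_within_stack_frames(stack, stackend, obj, len)) {
	case 1:		/* wholly inside one frame's args/locals */
		return true;
	case 0:		/* no frame-pointer info: cannot judge, accept */
		return true;
	default:	/* -1: crosses a frame boundary or is off-stack */
		return false;
	}
}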
Signed-off-by: Kees Cook Change-Id: I1f3b299bb8991d65dcdac6af85d633d4b7776df1 (cherry picked from commit 0f60a8efe4005ab5e65ce000724b04d4ca04a199) Signed-off-by: Sami Tolvanen --- arch/Kconfig | 9 ++++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 44 ++++++++++++++++++++++++++++++ include/linux/thread_info.h | 9 ++++++ 4 files changed, 63 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 31a318a56d98..98f64ad1caf1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -423,6 +423,15 @@ config CC_STACKPROTECTOR_STRONG endchoice +config HAVE_ARCH_WITHIN_STACK_FRAMES + bool + help + An architecture should select this if it can walk the kernel stack + frames to determine if an object is part of either the arguments + or local variables (i.e. that it excludes saved return addresses, + and similar) by implementing an inline arch_within_stack_frames(), + which is used by CONFIG_HARDENED_USERCOPY. + config HAVE_CONTEXT_TRACKING bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ae3c83e8bde8..4786de7750b1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_BPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..0c977fc124a7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -177,6 +177,50 @@ static inline unsigned long current_stack_pointer(void) return sp; } +/* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. + * + * Returns: + * 1 if within a frame + * -1 if placed across a frame boundary (or outside stack) + * 0 unable to determine (no frame pointers, etc) + */ +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ +#if defined(CONFIG_FRAME_POINTER) + const void *frame = NULL; + const void *oldframe; + + oldframe = __builtin_frame_address(1); + if (oldframe) + frame = __builtin_frame_address(2); + /* + * low ----------------------------------------------> high + * [saved bp][saved ip][args][local vars][saved bp][saved ip] + * ^----------------^ + * allow copies only within here + */ + while (stack <= frame && frame < stackend) { + /* + * If obj + len extends past the last frame, this + * check won't pass and the next frame will be 0, + * causing us to bail out and correctly report + * the copy as invalid. + */ + if (obj + len <= frame) + return obj >= oldframe + 2 * sizeof(void *) ? 
1 : -1; + oldframe = frame; + frame = *(const void * const *)frame; + } + return -1; +#else + return 0; +#endif +} + #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ff307b548ed3..5ecb68e86968 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -145,6 +145,15 @@ static inline bool test_and_clear_restore_sigmask(void) #error "no set_restore_sigmask() provided and default one won't work" #endif +#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ From d677b3104d3c8e927211f20083a6c2e97e003b87 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 7 Jun 2016 11:05:33 -0700 Subject: [PATCH 0283/3220] BACKPORT: mm: Hardened usercopy This is the start of porting PAX_USERCOPY into the mainline kernel. This is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The work is based on code by PaX Team and Brad Spengler, and an earlier port from Casey Schaufler. Additional non-slab page tests are from Rik van Riel. This patch contains the logic for validating several conditions when performing copy_to_user() and copy_from_user() on the kernel object being copied to/from: - address range doesn't wrap around - address range isn't NULL or zero-allocated (with a non-zero copy size) - if on the slab allocator: - object size must be less than or equal to copy size (when check is implemented in the allocator, which appear in subsequent patches) - otherwise, object must not span page allocations (excepting Reserved and CMA ranges) - if on the stack - object must not extend before/after the current process stack - object must be contained by a valid stack frame (when there is arch/build support for identifying stack frames) - object must not overlap with kernel text Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Tested-by: Michael Ellerman Change-Id: Iff3b5f1ddb04acd99ccf9a9046c7797363962b2a (cherry picked from commit f5509cc18daa7f82bcc553be70df2117c8eedc16) Signed-off-by: Sami Tolvanen --- include/linux/slab.h | 12 ++ include/linux/thread_info.h | 15 ++ mm/Makefile | 4 + mm/usercopy.c | 269 ++++++++++++++++++++++++++++++++++++ security/Kconfig | 28 ++++ 5 files changed, 328 insertions(+) create mode 100644 mm/usercopy.c diff --git a/include/linux/slab.h b/include/linux/slab.h index 2037a861e367..4ef384b172e0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -144,6 +144,18 @@ void kfree(const void *); void kzfree(const void *); size_t ksize(const void *); +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page); +#else +static inline const char *__check_heap_object(const void *ptr, + unsigned long n, + struct page *page) +{ + return NULL; +} +#endif + /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. 
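Both allocator-side implementations of __check_heap_object() added later in this series come down to one size invariant. Stated on its own (span_ok() is an illustrative name, not kernel code):

/*
 * offset: where the copy starts inside the heap object
 * n:      number of bytes the copy touches
 * size:   the allocator's usable size for that object
 *
 * Written so that "offset + n" cannot overflow.
 */
static bool span_ok(unsigned long offset, unsigned long n, unsigned long size)
{
	return offset <= size && n <= size - offset;
}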
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 5ecb68e86968..0ae29ff9ccfd 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -154,6 +154,21 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif +#ifdef CONFIG_HARDENED_USERCOPY +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + __check_object_size(ptr, n, to_user); +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ diff --git a/mm/Makefile b/mm/Makefile index 2ed43191fc3b..8b532c94008f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,9 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +# Since __builtin_frame_address does work as used, disable the warning. +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -81,3 +84,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/usercopy.c b/mm/usercopy.c new file mode 100644 index 000000000000..816feccebeb0 --- /dev/null +++ b/mm/usercopy.c @@ -0,0 +1,269 @@ +/* + * This implements the various checks for CONFIG_HARDENED_USERCOPY*, + * which are designed to protect kernel memory from needless exposure + * and overwrite under many unintended conditions. This code is based + * on PAX_USERCOPY, which is: + * + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source + * Security Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <asm/sections.h> + +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + +/* + * Checks if a given pointer and length is contained by the current + * stack frame (if possible). + * + * Returns: + * NOT_STACK: not at all on the stack + * GOOD_FRAME: fully within a valid stack frame + * GOOD_STACK: fully on the stack (when can't do frame-checking) + * BAD_STACK: error condition (invalid stack position or bad stack frame) + */ +static noinline int check_stack_object(const void *obj, unsigned long len) +{ + const void * const stack = task_stack_page(current); + const void * const stackend = stack + THREAD_SIZE; + int ret; + + /* Object is not on the stack at all. */ + if (obj + len <= stack || stackend <= obj) + return NOT_STACK; + + /* + * Reject: object partially overlaps the stack (passing the + * check above means at least one end is within the stack, + * so if this check fails, the other end is outside the stack). + */ + if (obj < stack || stackend < obj + len) + return BAD_STACK; + + /* Check if object is safely within a valid frame. 
*/ + ret = arch_within_stack_frames(stack, stackend, obj, len); + if (ret) + return ret; + + return GOOD_STACK; +} + +static void report_usercopy(const void *ptr, unsigned long len, + bool to_user, const char *type) +{ + pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n", + to_user ? "exposure" : "overwrite", + to_user ? "from" : "to", ptr, type ? : "unknown", len); + /* + * For greater effect, it would be nice to do do_group_exit(), + * but BUG() actually hooks all the lock-breaking and per-arch + * Oops code, so that is used here instead. + */ + BUG(); +} + +/* Returns true if any portion of [ptr,ptr+n) overlaps with [low,high). */ +static bool overlaps(const void *ptr, unsigned long n, unsigned long low, + unsigned long high) +{ + unsigned long check_low = (uintptr_t)ptr; + unsigned long check_high = check_low + n; + + /* Does not overlap if entirely above or entirely below. */ + if (check_low >= high || check_high < low) + return false; + + return true; +} + +/* Is this address range in the kernel text area? */ +static inline const char *check_kernel_text_object(const void *ptr, + unsigned long n) +{ + unsigned long textlow = (unsigned long)_stext; + unsigned long texthigh = (unsigned long)_etext; + unsigned long textlow_linear, texthigh_linear; + + if (overlaps(ptr, n, textlow, texthigh)) + return "<kernel text>"; + + /* + * Some architectures have virtual memory mappings with a secondary + * mapping of the kernel text, i.e. there is more than one virtual + * kernel address that points to the kernel image. It is usually + * when there is a separate linear physical memory mapping, in that + * __pa() is not just the reverse of __va(). This can be detected + * and checked: + */ + textlow_linear = (unsigned long)__va(__pa(textlow)); + /* No different mapping: we're done. */ + if (textlow_linear == textlow) + return NULL; + + /* Check the secondary mapping... */ + texthigh_linear = (unsigned long)__va(__pa(texthigh)); + if (overlaps(ptr, n, textlow_linear, texthigh_linear)) + return "<linear kernel text>"; + + return NULL; +} + +static inline const char *check_bogus_address(const void *ptr, unsigned long n) +{ + /* Reject if object wraps past end of memory. */ + if (ptr + n < ptr) + return "<wrapped address>"; + + /* Reject if NULL or ZERO-allocation. */ + if (ZERO_OR_NULL_PTR(ptr)) + return "<null>"; + + return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page, *endpage; + const void *end = ptr + n - 1; + bool is_reserved, is_cma; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* + * Sometimes the kernel data regions are not marked Reserved (see + * check below). And sometimes [_sdata,_edata) does not cover + * rodata and/or bss, so check each range explicitly. + */ + + /* Allow reads of kernel rodata region (if not marked as Reserved). */ + if (ptr >= (const void *)__start_rodata && + end <= (const void *)__end_rodata) { + if (!to_user) + return "<rodata>"; + return NULL; + } + + /* Allow kernel data region (if not marked as Reserved). */ + if (ptr >= (const void *)_sdata && end <= (const void *)_edata) + return NULL; + + /* Allow kernel bss region (if not marked as Reserved. 
*/ + if (ptr >= (const void *)__bss_start && + end <= (const void *)__bss_stop) + return NULL; + + /* Is the object wholly within one base page? */ + if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) == + ((unsigned long)end & (unsigned long)PAGE_MASK))) + return NULL; + + /* Allow if start and end are inside the same compound page. */ + endpage = virt_to_head_page(end); + if (likely(endpage == page)) + return NULL; + + /* + * Reject if range is entirely either Reserved (i.e. special or + * device memory), or CMA. Otherwise, reject since the object spans + * several independently allocated pages. + */ + is_reserved = PageReserved(page); + is_cma = is_migrate_cma_page(page); + if (!is_reserved && !is_cma) + goto reject; + + for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { + page = virt_to_head_page(ptr); + if (is_reserved && !PageReserved(page)) + goto reject; + if (is_cma && !is_migrate_cma_page(page)) + goto reject; + } + + return NULL; + +reject: + return "<spans multiple pages>"; +} + +/* + * Validates that the given object is: + * - not bogus address + * - known-safe heap or stack object + * - not in kernel text + */ +void __check_object_size(const void *ptr, unsigned long n, bool to_user) +{ + const char *err; + + /* Skip all tests if size is zero. */ + if (!n) + return; + + /* Check for invalid addresses. */ + err = check_bogus_address(ptr, n); + if (err) + goto report; + + /* Check for bad heap object. */ + err = check_heap_object(ptr, n, to_user); + if (err) + goto report; + + /* Check for bad stack object. */ + switch (check_stack_object(ptr, n)) { + case NOT_STACK: + /* Object is not touching the current process stack. */ + break; + case GOOD_FRAME: + case GOOD_STACK: + /* + * Object is either in the correct frame (when it + * is possible to check) or just generally on the + * process stack (when frame checking not available). + */ + return; + default: + err = "<process stack>"; + goto report; + } + + /* Check for object in kernel to avoid text exposure. */ + err = check_kernel_text_object(ptr, n); + if (!err) + return; + +report: + report_usercopy(ptr, n, to_user, err); +} +EXPORT_SYMBOL(__check_object_size); diff --git a/security/Kconfig b/security/Kconfig index 30a2603e8c85..aba6a8d4d1f4 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -127,6 +127,34 @@ config LSM_MMAP_MIN_ADDR this low address space will need the permission specific to the systems running LSM. +config HAVE_HARDENED_USERCOPY_ALLOCATOR + bool + help + The heap allocator implements __check_heap_object() for + validating memory ranges against heap object sizes in + support of CONFIG_HARDENED_USERCOPY. + +config HAVE_ARCH_HARDENED_USERCOPY + bool + help + The architecture supports CONFIG_HARDENED_USERCOPY by + calling check_object_size() just before performing the + userspace copies in the low level implementation of + copy_to_user() and copy_from_user(). + +config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_ARCH_HARDENED_USERCOPY + select BUG + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and + copy_from_user() functions) by rejecting memory ranges that + are larger than the specified heap object, span multiple + separately allocated pages, are not on the process stack, + or are part of the kernel text. This kills entire classes + of heap overflow exploits and similar kernel memory exposures. 
+ source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From 9c61fcc2b39760263436b13b97071ae6e8eeb5f2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:04:01 -0700 Subject: [PATCH 0284/3220] BACKPORT: x86/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on x86. This is done both in copy_*_user() and __copy_*_user() because copy_*_user() actually calls down to _copy_*_user() and not __copy_*_user(). Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Change-Id: I260db1d4572bdd2f779200aca99d03a170658440 (cherry picked from commit 5b710f34e194c6b7710f69fdb5d798fdf35b98c1) Signed-off-by: Sami Tolvanen --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 10 ++++++---- arch/x86/include/asm/uaccess_32.h | 2 ++ arch/x86/include/asm/uaccess_64.h | 2 ++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4786de7750b1..9e3b1e57499a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -79,6 +79,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5abdd0221fb4..32a047ce0806 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -714,9 +714,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) * case, and do only runtime checking for non-constant sizes. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); n = _copy_from_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_from_user_overflow(); else __copy_from_user_overflow(sz, n); @@ -732,9 +733,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); /* See the comment in copy_from_user() above. 
*/ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); n = _copy_to_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_to_user_overflow(); else __copy_to_user_overflow(sz, n); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index f5dcb5204dcd..2ffc9188cc70 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -43,6 +43,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); if (__builtin_constant_p(n)) { unsigned long ret; @@ -143,6 +144,7 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + check_object_size(to, n, false); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2f9b39b274a..512ffd621fc3 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -53,6 +53,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { int ret = 0; + check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -103,6 +104,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) { int ret = 0; + check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { From 073fcda55b357777cc3edaa2e67060cd8d78dbb3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:06:53 -0700 Subject: [PATCH 0285/3220] BACKPORT: ARM: uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm. Based on code from PaX and grsecurity. 
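The x86 hunks above apply one pattern that the ARM and arm64 enablement patches repeat: hang a one-line check_object_size() call in front of the existing copy routine. Its shape, as a hypothetical wrapper rather than any one architecture's real code:

static inline unsigned long
demo_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	/* to_user == false: "to" is the kernel object about to be written */
	check_object_size(to, n, false);
	return _copy_from_user(to, from, n);	/* unchanged low-level copy */
}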
Signed-off-by: Kees Cook Change-Id: I03a44ca7a8c56832f15a6a74ac32e9330df3ac3b (cherry picked from commit dfd45b6103c973bfcea2341d89e36faf947dbc33) Signed-off-by: Sami Tolvanen --- arch/arm/Kconfig | 1 + arch/arm/include/asm/uaccess.h | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index d3159ff4de5b..586134e01928 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -33,6 +33,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT) select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 select HAVE_ARCH_MMAP_RND_BITS if MMU diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 35c9db857ebe..7fb59199c6bb 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -496,7 +496,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n); static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) { - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(to, n, false); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_from_user(to, from, n); uaccess_restore(__ua_flags); return n; @@ -511,11 +514,15 @@ static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { #ifndef CONFIG_UACCESS_WITH_MEMCPY - unsigned int __ua_flags = uaccess_save_and_enable(); + unsigned int __ua_flags; + + check_object_size(from, n, true); + __ua_flags = uaccess_save_and_enable(); n = arm_copy_to_user(to, from, n); uaccess_restore(__ua_flags); return n; #else + check_object_size(from, n, true); return arm_copy_to_user(to, from, n); #endif } From 2f40fdd6bc0e1beca522657d3c9242e7b25917a2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:59:42 -0700 Subject: [PATCH 0286/3220] BACKPORT: arm64/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on arm64. As done by KASAN in -next, renames the low-level functions to __arch_copy_*_user() so a static inline can do additional work before the copy. 
Signed-off-by: Kees Cook Change-Id: I1286cae8e6ffcf12ea54ddd62f1a6d2ce742c8d0 (cherry picked from commit faf5b63e294151d6ac24ca6906d6f221bd3496cd) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/uaccess.h | 29 ++++++++++++++++++++++------- arch/arm64/kernel/arm64ksyms.c | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 579ab688312d..3ad9d9ea374b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -48,6 +48,7 @@ config ARM64 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index b2ede967fe7d..2a44b9795532 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -247,24 +247,39 @@ do { \ -EFAULT; \ }) -extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) +{ + check_object_size(to, n, false); + return __arch_copy_from_user(to, from, n); +} + +static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) +{ + check_object_size(from, n, true); + return __arch_copy_to_user(to, from, n); +} + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else /* security hole - plug it */ + if (access_ok(VERIFY_READ, from, n)) { + check_object_size(to, n, false); + n = __arch_copy_from_user(to, from, n); + } else /* security hole - plug it */ memset(to, 0, n); return n; } static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + if (access_ok(VERIFY_WRITE, to, n)) { + check_object_size(from, n, true); + n = __arch_copy_to_user(to, from, n); + } return n; } diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 3b6d8cc9dfe0..c654df05b7d7 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -33,8 +33,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); /* user mem (segment) */ -EXPORT_SYMBOL(__copy_from_user); -EXPORT_SYMBOL(__copy_to_user); +EXPORT_SYMBOL(__arch_copy_from_user); +EXPORT_SYMBOL(__arch_copy_to_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_in_user); diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4699cd74f87e..281e75db899a 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ 
-66,7 +66,7 @@ .endm end .req x5 -ENTRY(__copy_from_user) +ENTRY(__arch_copy_from_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret -ENDPROC(__copy_from_user) +ENDPROC(__arch_copy_from_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7512bbbc07ac..db4d187de61f 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -65,7 +65,7 @@ .endm end .req x5 -ENTRY(__copy_to_user) +ENTRY(__arch_copy_to_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) mov x0, #0 ret -ENDPROC(__copy_to_user) +ENDPROC(__arch_copy_to_user) .section .fixup,"ax" .align 2 From 103aa7df683e7811e88e82f52c2c3f16e419a7b1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:20:59 -0700 Subject: [PATCH 0287/3220] UPSTREAM: mm: SLAB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLAB allocator to catch any copies that may span objects. Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks Change-Id: Ib910a71fdc2ab808e1a45b6d33e9bae1681a1f4a (cherry picked from commit 04385fc5e8fffed84425d909a783c0f0c587d847) Signed-off-by: Sami Tolvanen --- init/Kconfig | 1 + mm/slab.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 235c7a2c0d20..fa031a140397 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1719,6 +1719,7 @@ choice config SLAB bool "SLAB" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work well in all environments. It organizes cache hot objects in diff --git a/mm/slab.c b/mm/slab.c index 4765c97ce690..24a615d42d74 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4228,6 +4228,36 @@ static int __init slab_proc_init(void) module_init(slab_proc_init); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + unsigned long offset; + + /* Find and validate object. */ + cachep = page->slab_cache; + objnr = obj_to_index(cachep, page, (void *)ptr); + BUG_ON(objnr >= cachep->num); + + /* Find offset within object. */ + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + + /* Allow address range falling entirely within object size. */ + if (offset <= cachep->object_size && n <= cachep->object_size - offset) + return NULL; + + return cachep->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + /** * ksize - get the actual amount of memory allocated for a given object * @objp: Pointer to the object From 3a9c8260c66ce387754641134089ba6ea5793bd5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:24:05 -0700 Subject: [PATCH 0288/3220] UPSTREAM: mm: SLUB hardened usercopy support Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the SLUB allocator to catch any copies that may span objects. Includes a redzone handling fix discovered by Michael Ellerman. 
Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Michael Ellerman Reviewed-by: Laura Abbott Change-Id: I52dc6fb3a3492b937d52b5cf9c046bf03dc40a3a (cherry picked from commit ed18adc1cdd00a5c55a20fbdaed4804660772281) Signed-off-by: Sami Tolvanen --- init/Kconfig | 1 + mm/slub.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index fa031a140397..e1d1d6936f92 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1727,6 +1727,7 @@ config SLAB config SLUB bool "SLUB (Unqueued Allocator)" + select HAVE_HARDENED_USERCOPY_ALLOCATOR help SLUB is a slab allocator that minimizes cache line usage instead of managing queues of cached objects (SLAB approach). diff --git a/mm/slub.c b/mm/slub.c index 46997517406e..470f5e561727 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3585,6 +3585,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_node); #endif +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects objects that are incorrectly sized. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +const char *__check_heap_object(const void *ptr, unsigned long n, + struct page *page) +{ + struct kmem_cache *s; + unsigned long offset; + size_t object_size; + + /* Find object and usable object size. */ + s = page->slab_cache; + object_size = slab_ksize(s); + + /* Reject impossible pointers. */ + if (ptr < page_address(page)) + return s->name; + + /* Find offset within object. */ + offset = (ptr - page_address(page)) % s->size; + + /* Adjust for redzone and reject if within the redzone. */ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) { + if (offset < s->red_left_pad) + return s->name; + offset -= s->red_left_pad; + } + + /* Allow address range falling entirely within object size. */ + if (offset <= object_size && n <= object_size - offset) + return NULL; + + return s->name; +} +#endif /* CONFIG_HARDENED_USERCOPY */ + static size_t __ksize(const void *object) { struct page *page; From 43243ce74a56007bf71e4fb3310f475fd4441ce4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Aug 2016 12:15:22 -0700 Subject: [PATCH 0289/3220] UPSTREAM: usercopy: avoid potentially undefined behavior in pointer math check_bogus_address() checked for pointer overflow using this expression, where 'ptr' has type 'const void *': ptr + n < ptr Since pointer wraparound is undefined behavior, gcc at -O2 by default treats it like the following, which would not behave as intended: (long)n < 0 Fortunately, this doesn't currently happen for kernel code because kernel code is compiled with -fno-strict-overflow. But the expression should be fixed anyway to use well-defined integer arithmetic, since it could be treated differently by different compilers in the future or could be reported by tools checking for undefined behavior. Signed-off-by: Eric Biggers Signed-off-by: Kees Cook Change-Id: I73b13be651cf35c03482f2014bf2c3dd291518ab (cherry picked from commit 7329a655875a2f4bd6984fe8a7e00a6981e802f3) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index 816feccebeb0..f5c4c3a6f2df 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -125,7 +125,7 @@ static inline const char *check_kernel_text_object(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. 
*/ - if (ptr + n < ptr) + if ((unsigned long)ptr + n < (unsigned long)ptr) return ""; /* Reject if NULL or ZERO-allocation. */ From 19787e3f1cd696356b4ec02fa3a345dcb4aaf3dc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Aug 2016 11:53:59 -0500 Subject: [PATCH 0290/3220] UPSTREAM: usercopy: fix overlap check for kernel text When running with a local patch which moves the '_stext' symbol to the very beginning of the kernel text area, I got the following panic with CONFIG_HARDENED_USERCOPY: usercopy: kernel memory exposure attempt detected from ffff88103dfff000 () (4096 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:79! invalid opcode: 0000 [#1] SMP ... CPU: 0 PID: 4800 Comm: cp Not tainted 4.8.0-rc3.after+ #1 Hardware name: Dell Inc. PowerEdge R720/0X3D66, BIOS 2.5.4 01/22/2016 task: ffff880817444140 task.stack: ffff880816274000 RIP: 0010:[] __check_object_size+0x76/0x413 RSP: 0018:ffff880816277c40 EFLAGS: 00010246 RAX: 000000000000006b RBX: ffff88103dfff000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88081f80dfa8 RDI: ffff88081f80dfa8 RBP: ffff880816277c90 R08: 000000000000054c R09: 0000000000000000 R10: 0000000000000005 R11: 0000000000000006 R12: 0000000000001000 R13: ffff88103e000000 R14: ffff88103dffffff R15: 0000000000000001 FS: 00007fb9d1750800(0000) GS:ffff88081f800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000021d2000 CR3: 000000081a08f000 CR4: 00000000001406f0 Stack: ffff880816277cc8 0000000000010000 000000043de07000 0000000000000000 0000000000001000 ffff880816277e60 0000000000001000 ffff880816277e28 000000000000c000 0000000000001000 ffff880816277ce8 ffffffff8136c3a6 Call Trace: [] copy_page_to_iter_iovec+0xa6/0x1c0 [] copy_page_to_iter+0x16/0x90 [] generic_file_read_iter+0x3e3/0x7c0 [] ? xfs_file_buffered_aio_write+0xad/0x260 [xfs] [] ? down_read+0x12/0x40 [] xfs_file_buffered_aio_read+0x51/0xc0 [xfs] [] xfs_file_read_iter+0x62/0xb0 [xfs] [] __vfs_read+0xdf/0x130 [] vfs_read+0x8e/0x140 [] SyS_read+0x55/0xc0 [] do_syscall_64+0x67/0x160 [] entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:[<00007fb9d0c33c00>] 0x7fb9d0c33c00 RSP: 002b:00007ffc9c262f28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: fffffffffff8ffff RCX: 00007fb9d0c33c00 RDX: 0000000000010000 RSI: 00000000021c3000 RDI: 0000000000000004 RBP: 00000000021c3000 R08: 0000000000000000 R09: 00007ffc9c264d6c R10: 00007ffc9c262c50 R11: 0000000000000246 R12: 0000000000010000 R13: 00007ffc9c2630b0 R14: 0000000000000004 R15: 0000000000010000 Code: 81 48 0f 44 d0 48 c7 c6 90 4d a3 81 48 c7 c0 bb b3 a2 81 48 0f 44 f0 4d 89 e1 48 89 d9 48 c7 c7 68 16 a3 81 31 c0 e8 f4 57 f7 ff <0f> 0b 48 8d 90 00 40 00 00 48 39 d3 0f 83 22 01 00 00 48 39 c3 RIP [] __check_object_size+0x76/0x413 RSP The checked object's range [ffff88103dfff000, ffff88103e000000) is valid, so there shouldn't have been a BUG. The hardened usercopy code got confused because the range's ending address is the same as the kernel's text starting address at 0xffff88103e000000. The overlap check is slightly off. 
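With concrete (made-up) numbers the off-by-one is easy to see; both ranges are half-open, so an object that ends exactly where the text begins shares no byte with it. A standalone userspace check:

#include <assert.h>
#include <stdbool.h>

static bool overlaps_old(unsigned long lo, unsigned long hi,
			 unsigned long low, unsigned long high)
{
	return !(lo >= high || hi < low);	/* pre-fix comparison */
}

static bool overlaps_new(unsigned long lo, unsigned long hi,
			 unsigned long low, unsigned long high)
{
	return !(lo >= high || hi <= low);	/* fixed comparison */
}

int main(void)
{
	/* object [0x1000, 0x2000) vs kernel text [0x2000, 0x3000) */
	assert(overlaps_old(0x1000, 0x2000, 0x2000, 0x3000));	/* false positive */
	assert(!overlaps_new(0x1000, 0x2000, 0x2000, 0x3000));	/* correct */
	return 0;
}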
Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Josh Poimboeuf Signed-off-by: Kees Cook Change-Id: I839dbf4ddbb4d9874026a42abed557eb9b3f8bef (cherry picked from commit 94cd97af690dd9537818dc9841d0ec68bb1dd877) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index f5c4c3a6f2df..f78015e8b1e5 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -84,7 +84,7 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low, unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ - if (check_low >= high || check_high < low) + if (check_low >= high || check_high <= low) return false; return true; From 65733da1dd5c3643e310d371bdb2420a6f3d8e97 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Thu, 25 Aug 2016 18:31:01 -0700 Subject: [PATCH 0291/3220] Android: MMC/UFS IO Latency Histograms. This patch adds a new sysfs node (latency_hist) and reports IO (svc time) latency histograms. Disabled by default, can be enabled by echoing 1 into latency_hist, stats can be cleared by writing 2 into latency_hist. This commit fixes the 32 bit build breakage in the previous commit. Tested on both 32 bit and 64 bit arm devices. Bug: 30677035 Change-Id: I9a615a16616d80f87e75676ac4d078a5c429dcf9 Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 82 +++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.c | 66 ++++++++++++++++++++++++++++++- drivers/mmc/core/host.c | 6 ++- drivers/mmc/core/host.h | 5 +++ drivers/scsi/ufs/ufshcd.c | 81 ++++++++++++++++++++++++++++++++++++++ drivers/scsi/ufs/ufshcd.h | 3 ++ include/linux/blkdev.h | 76 ++++++++++++++++++++++++++++++++++++ include/linux/mmc/core.h | 2 + include/linux/mmc/host.h | 4 ++ 9 files changed, 322 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 33e2f62d5062..840713279726 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -40,6 +40,8 @@ #include "blk.h" #include "blk-mq.h" +#include <linux/math64.h> + EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -3539,3 +3541,83 @@ int __init blk_dev_init(void) return 0; } + +/* + * Blk IO latency support. We want this to be as cheap as possible, so doing + * this lockless (and avoiding atomics), a few off-by-a-few errors in this + * code are not harmful, and we don't want to do anything that is + * perf-impactful. + * TODO : If necessary, we can make the histograms per-cpu and aggregate + * them when printing them out. 
+ */ +void +blk_zero_latency_hist(struct io_latency_state *s) +{ + memset(s->latency_y_axis_read, 0, + sizeof(s->latency_y_axis_read)); + memset(s->latency_y_axis_write, 0, + sizeof(s->latency_y_axis_write)); + s->latency_reads_elems = 0; + s->latency_writes_elems = 0; +} + +ssize_t +blk_latency_hist_show(struct io_latency_state *s, char *buf) +{ + int i; + int bytes_written = 0; + u_int64_t num_elem, elem; + int pct; + + num_elem = s->latency_reads_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Read Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_read[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + num_elem = s->latency_writes_elems; + if (num_elem > 0) { + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "IO svc_time Write Latency Histogram (n = %llu):\n", + num_elem); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %5lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis_write[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t> %5dms%15llu%15d%%\n", 10, + elem, pct); + } + return bytes_written; +} diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 9fab52559a8c..b503249950df 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,17 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); + if (mrq->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + mrq->io_start); + blk_update_latency_hist(&host->io_lat_s, + (mrq->data->flags & MMC_DATA_READ), + delta_us); + } trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -627,6 +638,11 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { + if (host->latency_hist_enabled) { + areq->mrq->io_start = ktime_get(); + areq->mrq->lat_hist_enabled = 1; + } else + areq->mrq->lat_hist_enabled = 0; trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -1964,7 +1980,7 @@ void mmc_init_erase(struct mmc_card *card) } static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, - unsigned int arg, unsigned int qty) + unsigned int arg, unsigned int qty) { unsigned int erase_timeout; @@ -2907,6 +2923,54 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + + return blk_latency_hist_show(&host->io_lat_s, buf); +} + +/* + * Values permitted 0, 1, 2. 
+ * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mmc_host *host = cls_dev_to_mmc_host(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&host->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + host->latency_hist_enabled = value; + return count; +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +void +mmc_latency_hist_sysfs_init(struct mmc_host *host) +{ + if (device_create_file(&host->class_dev, &dev_attr_latency_hist)) + dev_err(&host->class_dev, + "Failed to create latency_hist sysfs entry\n"); +} + +void +mmc_latency_hist_sysfs_exit(struct mmc_host *host) +{ + device_remove_file(&host->class_dev, &dev_attr_latency_hist); +} + subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcf7829c759e..17068839c74b 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -32,8 +32,6 @@ #include "slot-gpio.h" #include "pwrseq.h" -#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) - static DEFINE_IDR(mmc_host_idr); static DEFINE_SPINLOCK(mmc_host_lock); @@ -394,6 +392,8 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif + mmc_latency_hist_sysfs_init(host); + mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) register_pm_notifier(&host->pm_notify); @@ -422,6 +422,8 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif + mmc_latency_hist_sysfs_exit(host); + device_del(&host->class_dev); led_trigger_unregister_simple(host->led); diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h index 992bf5397633..bf38533406fd 100644 --- a/drivers/mmc/core/host.h +++ b/drivers/mmc/core/host.h @@ -12,6 +12,8 @@ #define _MMC_CORE_HOST_H #include <linux/mmc/host.h> +#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) + int mmc_register_host_class(void); void mmc_unregister_host_class(void); @@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host); void mmc_retune_release(struct mmc_host *host); int mmc_retune(struct mmc_host *host); +void mmc_latency_hist_sysfs_init(struct mmc_host *host); +void mmc_latency_hist_sysfs_exit(struct mmc_host *host); + #endif diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 85cd2564c157..4167bdbf0ecf 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -39,6 +39,7 @@ #include <linux/async.h> #include <linux/devfreq.h> +#include <linux/blkdev.h> #include "ufshcd.h" #include "unipro.h" @@ -1332,6 +1333,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd) clear_bit_unlock(tag, &hba->lrb_in_use); goto out; } + + /* IO svc time latency histogram */ + if (hba != NULL && cmd->request != NULL) { + if (hba->latency_hist_enabled && + (cmd->request->cmd_type == REQ_TYPE_FS)) { + cmd->request->lat_hist_io_start = ktime_get(); + cmd->request->lat_hist_enabled = 1; + } else + cmd->request->lat_hist_enabled = 0; + } + WARN_ON(hba->clk_gating.state != CLKS_ON); lrbp = &hba->lrb[tag]; @@ -3160,6 +3172,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) u32 tr_doorbell; int result; int index; + struct request *req; /* Resetting interrupt aggregation counters first and reading the * 
DOOR_BELL afterward allows us to handle all the completed requests. @@ -3184,6 +3197,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba) /* Mark completed command as NULL in LRB */ lrbp->cmd = NULL; clear_bit_unlock(index, &hba->lrb_in_use); + req = cmd->request; + if (req) { + /* Update IO svc time latency histogram */ + if (req->lat_hist_enabled) { + ktime_t completion; + u_int64_t delta_us; + + completion = ktime_get(); + delta_us = ktime_us_delta(completion, + req->lat_hist_io_start); + /* rq_data_dir() => true if WRITE */ + blk_update_latency_hist(&hba->io_lat_s, + (rq_data_dir(req) == READ), + delta_us); + } + } /* Do not touch lrbp after scsi done */ cmd->scsi_done(cmd); __ufshcd_release(hba); @@ -5327,6 +5356,54 @@ out: } EXPORT_SYMBOL(ufshcd_shutdown); +/* + * Values permitted 0, 1, 2. + * 0 -> Disable IO latency histograms (default) + * 1 -> Enable IO latency histograms + * 2 -> Zero out IO latency histograms + */ +static ssize_t +latency_hist_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + long value; + + if (kstrtol(buf, 0, &value)) + return -EINVAL; + if (value == BLK_IO_LAT_HIST_ZERO) + blk_zero_latency_hist(&hba->io_lat_s); + else if (value == BLK_IO_LAT_HIST_ENABLE || + value == BLK_IO_LAT_HIST_DISABLE) + hba->latency_hist_enabled = value; + return count; +} + +static ssize_t +latency_hist_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + return blk_latency_hist_show(&hba->io_lat_s, buf); +} + +static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, + latency_hist_show, latency_hist_store); + +static void +ufshcd_init_latency_hist(struct ufs_hba *hba) +{ + if (device_create_file(hba->dev, &dev_attr_latency_hist)) + dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n"); +} + +static void +ufshcd_exit_latency_hist(struct ufs_hba *hba) +{ + device_remove_file(hba->dev, &dev_attr_latency_hist); +} + /** * ufshcd_remove - de-allocate SCSI host and host memory space * data structure memory @@ -5342,6 +5419,7 @@ void ufshcd_remove(struct ufs_hba *hba) scsi_host_put(hba->host); ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); if (ufshcd_is_clkscaling_enabled(hba)) devfreq_remove_device(hba->devfreq); ufshcd_hba_exit(hba); @@ -5639,6 +5717,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) /* Hold auto suspend until async scan completes */ pm_runtime_get_sync(dev); + ufshcd_init_latency_hist(hba); + /* * The device-initialize-sequence hasn't been invoked yet. * Set the device to power-off state @@ -5653,6 +5733,7 @@ out_remove_scsi_host: scsi_remove_host(hba->host); exit_gating: ufshcd_exit_clk_gating(hba); + ufshcd_exit_latency_hist(hba); out_disable: hba->is_irq_enabled = false; scsi_host_put(host); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 2570d9477b37..f3780cf7d895 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -532,6 +532,9 @@ struct ufs_hba { struct devfreq *devfreq; struct ufs_clk_scaling clk_scaling; bool is_sys_suspended; + + int latency_hist_enabled; + struct io_latency_state io_lat_s; }; /* Returns true if clocks can be gated. 
Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..faf2c0093527 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -197,6 +197,9 @@ struct request { /* for bidi */ struct request *next_rq; + + ktime_t lat_hist_io_start; + int lat_hist_enabled; }; static inline unsigned short req_get_ioprio(struct request *req) @@ -1645,6 +1648,79 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, sector_t, void __pmem **addr, unsigned long *pfn, long size); + +/* + * X-axis for IO latency histogram support. + */ +static const u_int64_t latency_x_axis_us[] = { + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1200, + 1400, + 1600, + 1800, + 2000, + 2500, + 3000, + 4000, + 5000, + 6000, + 7000, + 9000, + 10000 +}; + +#define BLK_IO_LAT_HIST_DISABLE 0 +#define BLK_IO_LAT_HIST_ENABLE 1 +#define BLK_IO_LAT_HIST_ZERO 2 + +struct io_latency_state { + u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_reads_elems; + u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_writes_elems; +}; + +static inline void +blk_update_latency_hist(struct io_latency_state *s, + int read, + u_int64_t delta_us) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { + if (delta_us < (u_int64_t)latency_x_axis_us[i]) { + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + break; + } + } + if (i == ARRAY_SIZE(latency_x_axis_us)) { + /* Overflowed the histogram */ + if (read) + s->latency_y_axis_read[i]++; + else + s->latency_y_axis_write[i]++; + } + if (read) + s->latency_reads_elems++; + else + s->latency_writes_elems++; +} + +void blk_zero_latency_hist(struct io_latency_state *s); +ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); + #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 37967b6da03c..3349f0676acb 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -136,6 +136,8 @@ struct mmc_request { struct completion completion; void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; + ktime_t io_start; + int lat_hist_enabled; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 40025b28c1fb..e4862f7cdede 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,9 @@ struct mmc_host { } embedded_sdio_data; #endif + int latency_hist_enabled; + struct io_latency_state io_lat_s; + unsigned long private[0] ____cacheline_aligned; }; From ef1cf0cfecdb16c520e32628ef8e6d9fa86fc08f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 Aug 2016 12:47:01 -0700 Subject: [PATCH 0292/3220] UPSTREAM: Make the hardened user-copy code depend on having a hardened allocator The kernel test robot reported a usercopy failure in the new hardened sanity checks, due to a page-crossing copy of the FPU state into the task structure. This happened because the kernel test robot was testing with SLOB, which doesn't actually do the required book-keeping for slab allocations, and as a result the hardening code didn't realize that the task struct allocation was one single allocation - and the sanity checks fail. 
Since SLOB doesn't even claim to support hardening (and you really shouldn't use it), the straightforward solution is to just make the usercopy hardening code depend on the allocator supporting it. Reported-by: kernel test robot Cc: Kees Cook Signed-off-by: Linus Torvalds Change-Id: I37d51f866f873341bf7d5297249899b852e1c6ce (cherry picked from commit 6040e57658eee6eb1315a26119101ca832d1f854) Signed-off-by: Sami Tolvanen --- security/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/security/Kconfig b/security/Kconfig index aba6a8d4d1f4..2b42c225de28 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -145,6 +145,7 @@ config HAVE_ARCH_HARDENED_USERCOPY config HARDENED_USERCOPY bool "Harden memory copies between kernel and userspace" depends on HAVE_ARCH_HARDENED_USERCOPY + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR select BUG help This option checks for obviously wrong memory regions when From 0715ce3b882b9982b6acdbf9e46fccd1e6f180d7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:45:09 -0800 Subject: [PATCH 0293/3220] UPSTREAM: x86: reorganize SMAP handling in user space accesses This reorganizes how we do the stac/clac instructions in the user access code. Instead of adding the instructions directly to the same inline asm that does the actual user level access and exception handling, add them at a higher level. This is mainly preparation for the next step, where we will expose an interface to allow users to mark several accesses together as being user space accesses, but it does already clean up some code: - the inlined trivial cases of copy_in_user() now do stac/clac just once over the accesses: they used to do one pair around the user space read, and another pair around the write-back. - the {get,put}_user_ex() macros that are used with the catch/try handling don't do any stac/clac at all, because that happens in the try/catch surrounding them. Other than those two cleanups that happened naturally from the re-organization, this should not make any difference. Yet. Signed-off-by: Linus Torvalds Change-Id: Iaad8756bc8e95876e2e2a7d7bbd333fc478ab441 (cherry picked from commit 11f1a4b9755f5dbc3e822a96502ebe9b044b14d8) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 53 +++++++++++------ arch/x86/include/asm/uaccess_64.h | 94 ++++++++++++++++++++++--------- 2 files changed, 101 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 32a047ce0806..be439e246d91 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -134,6 +134,9 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); +#define __uaccess_begin() stac() +#define __uaccess_end() clac() + /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. 
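/*
 * Editorial sketch, not part of the patch: the two helpers above replace
 * the per-asm ASM_STAC/ASM_CLAC pairs, so a whole access sequence is
 * bracketed once by the caller. The helper name fetch_user_u32() is
 * hypothetical; __get_user_size() is the existing macro the patch builds on.
 */
static inline int fetch_user_u32(u32 *dst, const u32 __user *src)
{
	int err;

	__uaccess_begin();	/* stac(): open the SMAP window */
	__get_user_size(*dst, src, sizeof(*dst), err, -EFAULT);
	__uaccess_end();	/* clac(): close it again on every path */
	return err;
}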
@@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) @@ -304,6 +307,10 @@ do { \ } \ } while (0) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __put_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -358,9 +365,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %2,%"rtype"1\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -370,6 +377,10 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __get_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -400,7 +411,9 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ + __uaccess_begin(); \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ + __uaccess_end(); \ __builtin_expect(__pu_err, 0); \ }) @@ -408,7 +421,9 @@ do { \ ({ \ int __gu_err; \ unsigned long __gu_val; \ + __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ }) @@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %"rtype"1,%2\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; }; */ #define uaccess_try do { \ current_thread_info()->uaccess_err = 0; \ - stac(); \ + __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ - clac(); \ + __uaccess_end(); \ (err) |= (current_thread_info()->uaccess_err ? 
-EFAULT : 0); \ } while (0) @@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void) __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ + __uaccess_begin(); \ switch (size) { \ case 1: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 2: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 4: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void) if (!IS_ENABLED(CONFIG_X86_64)) \ __cmpxchg_wrong_size(); \ \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void) default: \ __cmpxchg_wrong_size(); \ } \ + __uaccess_end(); \ *__uval = __old; \ __ret; \ }) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 512ffd621fc3..2957c8237c28 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -57,35 +57,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { - case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + case 1: + __uaccess_begin(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); + __uaccess_end(); return ret; - case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, + case 2: + __uaccess_begin(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; - case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src, + case 4: + __uaccess_begin(); + __get_user_asm(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); + __uaccess_end(); return ret; - case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, + case 8: + __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); + if (likely(!ret)) + __get_user_asm(*(u16 *)(8 + (char *)dst), + (u16 __user *)(8 + (char __user *)src), + ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); + if (likely(!ret)) + __get_user_asm(*(u64 *)(8 + (char *)dst), + (u64 __user *)(8 + (char __user *)src), + ret, "q", "", "=r", 8); + __uaccess_end(); return ret; default: return copy_user_generic(dst, (__force void *)src, size); 
@@ -108,35 +122,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { - case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + case 1: + __uaccess_begin(); + __put_user_asm(*(u8 *)src, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; - case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, + case 2: + __uaccess_begin(); + __put_user_asm(*(u16 *)src, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; - case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, + case 4: + __uaccess_begin(); + __put_user_asm(*(u32 *)src, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; - case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, + case 8: + __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 10); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, + ret, "w", "w", "ir", 2); + } + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 16); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, + ret, "q", "", "er", 8); + } + __uaccess_end(); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -162,39 +192,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) switch (size) { case 1: { u8 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u8 __user *)src, ret, "b", "b", "=q", 1); if (likely(!ret)) __put_user_asm(tmp, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; } case 2: { u16 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u16 __user *)src, ret, "w", "w", "=r", 2); if (likely(!ret)) __put_user_asm(tmp, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; } case 4: { u32 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u32 __user *)src, ret, "l", "k", "=r", 4); if (likely(!ret)) __put_user_asm(tmp, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; } case 8: { u64 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u64 __user *)src, ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; } default: From a5eeb9b4123eee6aa88bb7a43dba919641abeeb6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:55:12 -0700 Subject: [PATCH 0294/3220] UPSTREAM: mm/slub: support left redzone SLUB already has a redzone debugging feature. But it is only positioned at the end of object (aka right redzone) so it cannot catch left oob. Although current object's right redzone acts as left redzone of next object, first object in a slab cannot take advantage of this effect. This patch explicitly adds a left red zone to each object to detect left oob more precisely. Background: Someone complained to me that left OOB doesn't catch even if KASAN is enabled which does page allocation debugging. 
That page is out of our control so it would be allocated when left OOB happens and, in this case, we can't find OOB. Moreover, SLUB debugging feature can be enabled without page allocator debugging and, in this case, we will miss that OOB. Before trying to implement, I expected that changes would be too complex, but, it doesn't look that complex to me now. Almost changes are applied to debug specific functions so I feel okay. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: Ib893a17ecabd692e6c402e864196bf89cd6781a5 (cherry picked from commit d86bd1bece6fc41d59253002db5441fe960a37f6) Signed-off-by: Sami Tolvanen --- include/linux/slub_def.h | 1 + mm/slub.c | 100 +++++++++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 33885118523c..f4e857e920cd 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -81,6 +81,7 @@ struct kmem_cache { int reserved; /* Reserved bytes at the end of slabs */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ + int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif diff --git a/mm/slub.c b/mm/slub.c index 470f5e561727..cfd3579cbaeb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -456,6 +448,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + 
return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -489,6 +497,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -628,7 +656,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -645,9 +675,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -677,6 +707,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -769,11 +802,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -817,6 +850,10 @@ static int check_object(struct kmem_cache *s, struct page *page, u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { + if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; @@ -1468,7 +1505,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -3283,7 +3320,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3292,6 +3329,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. 
*/ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* From cd55f3c2be7a55bb708ba0f5fb02ae1c08cc1d33 Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Tue, 6 Sep 2016 15:16:25 -0700 Subject: [PATCH 0295/3220] cpuset: Make cpusets restore on hotplug This deliberately changes the behavior of the per-cpuset cpus file to not be affected by hotplug. When a cpu is offlined, it will be removed from the cpuset/cpus file. When a cpu is onlined, if the cpuset originally requested that the cpu be part of the cpuset, the cpu will be restored to the cpuset. The cpus files still have to be hierarchical, but the ranges no longer have to be drawn from the currently online cpus, just the physically present cpus. Change-Id: I22cdf33e7d312117bcefba1aeb0125e1ada289a9 Signed-off-by: Dmitry Shmidt --- kernel/cpuset.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0ff01c2a7222..3b4f27981778 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -99,6 +99,7 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; + cpumask_var_t cpus_requested; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ @@ -385,7 +386,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_requested, q->cpus_requested) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -485,7 +486,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_requested, c->cpus_requested)) goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -944,17 +945,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!*buf) { cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_requested); if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) return -EINVAL; + + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); } /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) return 0; retval = validate_change(cs, trialcs); @@ -963,6 +965,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); spin_unlock_irq(&callback_lock); /* use trialcs->cpus_allowed as a temp variable */ @@ -1731,7 +1734,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) switch (type) { case FILE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested)); break; case FILE_MEMLIST: seq_printf(sf,
"%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); @@ -1920,11 +1923,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) goto free_cs; + if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_allowed; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_cpus; + goto free_requested; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_requested); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); @@ -1933,7 +1939,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return &cs->css; -free_cpus: +free_requested: + free_cpumask_var(cs->cpus_requested); +free_allowed: free_cpumask_var(cs->cpus_allowed); free_cs: kfree(cs); @@ -1996,6 +2004,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->cpus_requested, parent->cpus_requested); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: @@ -2030,6 +2039,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->cpus_requested); kfree(cs); } @@ -2096,8 +2106,11 @@ int __init cpuset_init(void) BUG(); if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_requested); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); @@ -2231,7 +2244,7 @@ retry: goto retry; } - cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); From 96a970e08bdec910c7780e8f823b3595eef6ba8c Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Tue, 2 Feb 2016 13:36:45 -0500 Subject: [PATCH 0296/3220] UPSTREAM: netfilter: nfnetlink: correctly validate length of batch messages (cherry picked from commit c58d6c93680f28ac58984af61d0a7ebf4319c241) If nlh->nlmsg_len is zero then an infinite loop is triggered because 'skb_pull(skb, msglen);' pulls zero bytes. The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len < NLMSG_HDRLEN' which bypasses the length validation and will later trigger an out-of-bound read. If the length validation does fail then the malformed batch message is copied back to userspace. However, we cannot do this because the nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in netlink_ack: [ 41.455421] ================================================================== [ 41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr ffff880119e79340 [ 41.456431] Read of size 4294967280 by task a.out/987 [ 41.456431] ============================================================================= [ 41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 41.456431] ----------------------------------------------------------------------------- ... 
[ 41.456431] Bytes b4 ffff880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe ff 00 00 00 00 ................ [ 41.456431] Object ffff880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 00 00 00 00 ............... [ 41.456431] Object ffff880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 33 10 00 05 .......@EV."3... [ 41.456431] Object ffff880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 00 06 fe fb ................ ^^ start of batch nlmsg with nlmsg_len=4294967280 ... [ 41.456431] Memory state around the buggy address: [ 41.456431] ffff880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] ffff880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] >ffff880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ^ [ 41.456431] ffff880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ffff880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb [ 41.456431] ================================================================== Fix this with better validation of nlh->nlmsg_len and by setting NFNL_BATCH_FAILURE if any batch message fails length validation. CAP_NET_ADMIN is required to trigger the bugs. Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from batch") Signed-off-by: Phil Turnbull Signed-off-by: Pablo Neira Ayuso Change-Id: Id3e15c40cb464bf2791af907c235d8a316b2449c Bug: 30947055 --- net/netfilter/nfnetlink.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 77afe913d03d..9adedba78eea 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -326,10 +326,12 @@ replay: nlh = nlmsg_hdr(skb); err = 0; - if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || - skb->len < nlh->nlmsg_len) { - err = -EINVAL; - goto ack; + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { + nfnl_err_reset(&err_list); + status |= NFNL_BATCH_FAILURE; + goto done; } /* Only requests are handled by the kernel */ From 6b93f8214eabf0f363eab5283c2ad18b5bc33135 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Fri, 27 Nov 2015 14:30:21 -0500 Subject: [PATCH 0297/3220] UPSTREAM: tty: Prevent ldisc drivers from re-using stale tty fields (cherry picked from commit dd42bf1197144ede075a9d4793123f7689e164bc) Line discipline drivers may mistakenly misuse ldisc-related fields when initializing. For example, a failure to initialize tty->receive_room in the N_GIGASET_M101 line discipline was recently found and fixed [1]. Now, the N_X25 line discipline has been discovered accessing the previous line discipline's already-freed private data [2]. Harden the ldisc interface against misuse by initializing relevant tty fields before instancing the new line discipline. [1] commit fd98e9419d8d622a4de91f76b306af6aa627aa9c Author: Tilman Schmidt Date: Tue Jul 14 00:37:13 2015 +0200 isdn/gigaset: reset tty->receive_room when attaching ser_gigaset [2] Report from Sasha Levin [ 634.336761] ================================================================== [ 634.338226] BUG: KASAN: use-after-free in x25_asy_open_tty+0x13d/0x490 at addr ffff8800a743efd0 [ 634.339558] Read of size 4 by task syzkaller_execu/8981 [ 634.340359] ============================================================================= [ 634.341598] BUG kmalloc-512 (Not tainted): kasan: bad access detected ...
[ 634.405018] Call Trace: [ 634.405277] dump_stack (lib/dump_stack.c:52) [ 634.405775] print_trailer (mm/slub.c:655) [ 634.406361] object_err (mm/slub.c:662) [ 634.406824] kasan_report_error (mm/kasan/report.c:138 mm/kasan/report.c:236) [ 634.409581] __asan_report_load4_noabort (mm/kasan/report.c:279) [ 634.411355] x25_asy_open_tty (drivers/net/wan/x25_asy.c:559 (discriminator 1)) [ 634.413997] tty_ldisc_open.isra.2 (drivers/tty/tty_ldisc.c:447) [ 634.414549] tty_set_ldisc (drivers/tty/tty_ldisc.c:567) [ 634.415057] tty_ioctl (drivers/tty/tty_io.c:2646 drivers/tty/tty_io.c:2879) [ 634.423524] do_vfs_ioctl (fs/ioctl.c:43 fs/ioctl.c:607) [ 634.427491] SyS_ioctl (fs/ioctl.c:622 fs/ioctl.c:613) [ 634.427945] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:188) Cc: Tilman Schmidt Cc: Sasha Levin Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman Change-Id: Ibed6feadfb9706d478f93feec3b240aecfc64af3 Bug: 30951112 --- drivers/tty/tty_ldisc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 629e3c865072..9bee25cfa0be 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -417,6 +417,10 @@ EXPORT_SYMBOL_GPL(tty_ldisc_flush); * they are not on hot paths so a little discipline won't do * any harm. * + * The line discipline-related tty_struct fields are reset to + * prevent the ldisc driver from re-using stale information for + * the new ldisc instance. + * * Locking: takes termios_rwsem */ @@ -425,6 +429,9 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) down_write(&tty->termios_rwsem); tty->termios.c_line = num; up_write(&tty->termios_rwsem); + + tty->disc_data = NULL; + tty->receive_room = 0; } /** From 65a2cd6d500d2620e5a5aa156b518d5fd25d492a Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 7 Sep 2016 17:39:42 -0700 Subject: [PATCH 0298/3220] Android: Fix build breakages. The IO latency histogram change broke allmodconfig and allnoconfig builds. This fixes those breakages. Change-Id: I9cdae655b40ed155468f3cef25cdb74bb56c4d3e Signed-off-by: Mohan Srinivasan --- block/blk-core.c | 2 ++ include/linux/mmc/host.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 840713279726..64f3455a83ef 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3560,6 +3560,7 @@ blk_zero_latency_hist(struct io_latency_state *s) s->latency_reads_elems = 0; s->latency_writes_elems = 0; } +EXPORT_SYMBOL(blk_zero_latency_hist); ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf) @@ -3621,3 +3622,4 @@ blk_latency_hist_show(struct io_latency_state *s, char *buf) } return bytes_written; } +EXPORT_SYMBOL(blk_latency_hist_show); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e4862f7cdede..97b2b0b1f99d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -380,8 +380,10 @@ struct mmc_host { } embedded_sdio_data; #endif +#ifdef CONFIG_BLOCK int latency_hist_enabled; struct io_latency_state io_lat_s; +#endif unsigned long private[0] ____cacheline_aligned; }; From 9f139d106df7b6a1805b0e4f3b234cedf25d2bbb Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 1 Jul 2016 00:39:35 -0700 Subject: [PATCH 0299/3220] UPSTREAM: block: fix use-after-free in sys_ioprio_get() (cherry picked from commit 8ba8682107ee2ca3347354e018865d8e1967c5f4) get_task_ioprio() accesses the task->io_context without holding the task lock and thus can race with exit_io_context(), leading to a use-after-free. 
The reproducer below hits this within a few seconds on my 4-core QEMU VM: #define _GNU_SOURCE #include #include #include #include int main(int argc, char **argv) { pid_t pid, child; long nproc, i; /* ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); */ syscall(SYS_ioprio_set, 1, 0, 0x6000); nproc = sysconf(_SC_NPROCESSORS_ONLN); for (i = 0; i < nproc; i++) { pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { pid = fork(); assert(pid != -1); if (pid == 0) { _exit(0); } else { child = wait(NULL); assert(child == pid); } } } pid = fork(); assert(pid != -1); if (pid == 0) { for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } } } for (;;) { /* ioprio_get(IOPRIO_WHO_PGRP, 0); */ syscall(SYS_ioprio_get, 2, 0); } return 0; } This gets us KASAN dumps like this: [ 35.526914] ================================================================== [ 35.530009] BUG: KASAN: out-of-bounds in get_task_ioprio+0x7b/0x90 at addr ffff880066f34e6c [ 35.530009] Read of size 2 by task ioprio-gpf/363 [ 35.530009] ============================================================================= [ 35.530009] BUG blkdev_ioc (Not tainted): kasan: bad access detected [ 35.530009] ----------------------------------------------------------------------------- [ 35.530009] Disabling lock debugging due to kernel taint [ 35.530009] INFO: Allocated in create_task_io_context+0x2b/0x370 age=0 cpu=0 pid=360 [ 35.530009] ___slab_alloc+0x55d/0x5a0 [ 35.530009] __slab_alloc.isra.20+0x2b/0x40 [ 35.530009] kmem_cache_alloc_node+0x84/0x200 [ 35.530009] create_task_io_context+0x2b/0x370 [ 35.530009] get_task_io_context+0x92/0xb0 [ 35.530009] copy_process.part.8+0x5029/0x5660 [ 35.530009] _do_fork+0x155/0x7e0 [ 35.530009] SyS_clone+0x19/0x20 [ 35.530009] do_syscall_64+0x195/0x3a0 [ 35.530009] return_from_SYSCALL_64+0x0/0x6a [ 35.530009] INFO: Freed in put_io_context+0xe7/0x120 age=0 cpu=0 pid=1060 [ 35.530009] __slab_free+0x27b/0x3d0 [ 35.530009] kmem_cache_free+0x1fb/0x220 [ 35.530009] put_io_context+0xe7/0x120 [ 35.530009] put_io_context_active+0x238/0x380 [ 35.530009] exit_io_context+0x66/0x80 [ 35.530009] do_exit+0x158e/0x2b90 [ 35.530009] do_group_exit+0xe5/0x2b0 [ 35.530009] SyS_exit_group+0x1d/0x20 [ 35.530009] entry_SYSCALL_64_fastpath+0x1a/0xa4 [ 35.530009] INFO: Slab 0xffffea00019bcd00 objects=20 used=4 fp=0xffff880066f34ff0 flags=0x1fffe0000004080 [ 35.530009] INFO: Object 0xffff880066f34e58 @offset=3672 fp=0x0000000000000001 [ 35.530009] ================================================================== Fix it by grabbing the task lock while we poke at the io_context. 
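The race is easiest to see as a timeline; the sketch below is editorial, but the locking matches the diff that follows, and exit_io_context() already takes the same task lock when it clears the pointer:

	/*
	 * Before (no lock on the read side):
	 *
	 *   CPU0: sys_ioprio_get              CPU1: do_exit -> exit_io_context
	 *     if (p->io_context)                task_lock(p);
	 *                                       ioc = p->io_context;
	 *                                       p->io_context = NULL;
	 *                                       task_unlock(p);
	 *                                       put_io_context(ioc);   frees ioc
	 *     ret = p->io_context->ioprio;      <-- use-after-free
	 *
	 * After: the reader holds the same lock, so it either sees the old
	 * io_context while it is still live, or sees NULL.
	 */
	task_lock(p);
	if (p->io_context)
		ret = p->io_context->ioprio;
	task_unlock(p);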
Cc: stable@vger.kernel.org Reported-by: Dmitry Vyukov Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Change-Id: I3f5858cc9a1b9d4124ae7a6578660dec219d2c57 Bug: 30946378 --- block/ioprio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/ioprio.c b/block/ioprio.c index cc7800e9eb44..01b8116298a1 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -150,8 +150,10 @@ static int get_task_ioprio(struct task_struct *p) if (ret) goto out; ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + task_unlock(p); out: return ret; } From d942dd3b7c156b005bf01c33af729ab783182f4c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 19 Jan 2016 12:34:58 +0100 Subject: [PATCH 0300/3220] UPSTREAM: HID: core: prevent out-of-bound readings (cherry picked from commit 50220dead1650609206efe91f0cc116132d59b3f) Plugging a Logitech DJ receiver with KASAN activated raises a bunch of out-of-bound readings. The fields are allocated up to MAX_USAGE, meaning that potentially, we do not have enough fields to fit the incoming values. Add checks and silence KASAN. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Change-Id: Iaf25e882a6696884439d7091b5fbb0b350d893d3 Bug: 30951261 --- drivers/hid/hid-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index c6f7a694f67a..fa5f81bbc95f 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1251,6 +1251,7 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && value[n] >= min && value[n] <= max && + value[n] - min < field->maxusage && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) goto exit; } @@ -1263,11 +1264,13 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field, } if (field->value[n] >= min && field->value[n] <= max + && field->value[n] - min < field->maxusage && field->usage[field->value[n] - min].hid && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (value[n] >= min && value[n] <= max + && value[n] - min < field->maxusage && field->usage[value[n] - min].hid && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); From 1a4f17ec07cc4a938b6695e92e4f2b2ec7c3d527 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 Sep 2016 11:56:01 -0700 Subject: [PATCH 0301/3220] UPSTREAM: x86/uaccess: force copy_*_user() to be inlined As already done with __copy_*_user(), mark copy_*_user() as __always_inline. Without this, the checks for things like __builtin_const_p() won't work consistently in either hardened usercopy nor the recent adjustments for detecting usercopy overflows at compile time. 
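Whether those checks fire depends on where the compiler expands the body. A hypothetical call site (not from the patch; buf and ubuf are illustrative) shows what forcing the inline buys:

	char buf[16];

	/*
	 * With copy_from_user() marked __always_inline, the body is expanded
	 * here, __builtin_constant_p(sizeof(buf)) evaluates to true, and the
	 * compile-time object-size check can run. Left as an out-of-line
	 * call, n would be an ordinary runtime parameter and the predicate
	 * would always be false.
	 */
	if (copy_from_user(buf, ubuf, sizeof(buf)))	/* n is the constant 16 */
		return -EFAULT;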
The change in kernel text size is detectable, but very small: text data bss dec hex filename 12118735 5768608 14229504 32116847 1ea106f vmlinux.before 12120207 5768608 14229504 32118319 1ea162f vmlinux.after Signed-off-by: Kees Cook Change-Id: I284c85c2a782145f46655a91d4f83874c90eba61 (cherry picked from commit e6971009a95a74f28c58bbae415c40effad1226c) Signed-off-by: Sami Tolvanen --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index be439e246d91..dbe64f27280e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -706,7 +706,7 @@ __copy_from_user_overflow(int size, unsigned long count) #endif -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -742,7 +742,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); From 1fd70f71e8f428f0c191a6b86e72e700dbe34bed Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 31 Aug 2016 16:04:21 -0700 Subject: [PATCH 0302/3220] BACKPORT: usercopy: fold builtin_const check into inline function Instead of having each caller of check_object_size() need to remember to check for a const size parameter, move the check into check_object_size() itself. This actually matches the original implementation in PaX, though this commit cleans up the now-redundant builtin_const() calls in the various architectures. Signed-off-by: Kees Cook Change-Id: I348809399c10ffa051251866063be674d064b9ff (cherry picked from 81409e9e28058811c9ea865345e1753f8f677e44) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0ae29ff9ccfd..eded095fe81e 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -161,7 +161,8 @@ extern void __check_object_size(const void *ptr, unsigned long n, static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { - __check_object_size(ptr, n, to_user); + if (!__builtin_constant_p(n)) + __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, From 93584e7e1b4ea543ed04cff643186946196eaebb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:39:32 -0700 Subject: [PATCH 0303/3220] UPSTREAM: usercopy: force check_object_size() inline Just for good measure, make sure that check_object_size() is always inlined too, as already done for copy_*_user() and __copy_*_user(). 
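Together with the previous patch, this makes the fast path fully decidable at compile time. A hypothetical caller (buffer and length names are illustrative) shows both halves:

	char name[32];

	/* Constant size: !__builtin_constant_p(n) is false, so the call to
	 * __check_object_size() is compiled out entirely. */
	check_object_size(name, sizeof(name), false);

	/* Runtime size: the check is kept, and __always_inline guarantees the
	 * constant-ness test is evaluated in this context rather than behind
	 * a call frame. */
	check_object_size(name, len, false);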
Suggested-by: Linus Torvalds Signed-off-by: Kees Cook Change-Id: Ibfdf4790d03fe426e68d9a864c55a0d1bbfb7d61 (cherry picked from commit a85d6b8242dc78ef3f4542a0f979aebcbe77fc4e) Signed-off-by: Sami Tolvanen --- include/linux/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index eded095fe81e..4cf89517783a 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -158,8 +158,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); From 92e04f8a13a8bb56a7544e56978663dc2e4f82cc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Sep 2016 09:54:34 -0700 Subject: [PATCH 0304/3220] UPSTREAM: usercopy: remove page-spanning test for now A custom allocator without __GFP_COMP that copies to userspace has been found in vmw_execbuf_process[1], so this disables the page-span checker by placing it behind a CONFIG for future work where such things can be tracked down later. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1373326 Reported-by: Vinson Lee Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Kees Cook Change-Id: I4177c0fb943f14a5faf5c70f5e54bf782c316f43 (cherry picked from commit 8e1f74ea02cf4562404c48c6882214821552c13f) Signed-off-by: Sami Tolvanen --- mm/usercopy.c | 61 +++++++++++++++++++++++++++--------------------- security/Kconfig | 11 +++++++++ 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/mm/usercopy.c b/mm/usercopy.c index f78015e8b1e5..b34996a3860b 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -135,30 +135,15 @@ static inline const char *check_bogus_address(const void *ptr, unsigned long n) return NULL; } -static inline const char *check_heap_object(const void *ptr, unsigned long n, - bool to_user) +/* Checks for allocs that are marked in some way as spanning multiple pages. */ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) { - struct page *page, *endpage; +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN const void *end = ptr + n - 1; + struct page *endpage; bool is_reserved, is_cma; - /* - * Some architectures (arm64) return true for virt_addr_valid() on - * vmalloced addresses. Work around this by checking for vmalloc - * first. - */ - if (is_vmalloc_addr(ptr)) - return NULL; - - if (!virt_addr_valid(ptr)) - return NULL; - - page = virt_to_head_page(ptr); - - /* Check slab allocator for flags and size. */ - if (PageSlab(page)) - return __check_heap_object(ptr, n, page); - /* * Sometimes the kernel data regions are not marked Reserved (see * check below). And sometimes [_sdata,_edata) does not cover @@ -187,7 +172,7 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, ((unsigned long)end & (unsigned long)PAGE_MASK))) return NULL; - /* Allow if start and end are inside the same compound page. */ + /* Allow if fully inside the same compound (__GFP_COMP) page. 
*/ endpage = virt_to_head_page(end); if (likely(endpage == page)) return NULL; @@ -200,20 +185,44 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, is_reserved = PageReserved(page); is_cma = is_migrate_cma_page(page); if (!is_reserved && !is_cma) - goto reject; + return ""; for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { page = virt_to_head_page(ptr); if (is_reserved && !PageReserved(page)) - goto reject; + return ""; if (is_cma && !is_migrate_cma_page(page)) - goto reject; + return ""; } +#endif return NULL; +} -reject: - return ""; +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + */ + if (is_vmalloc_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); + + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); } /* diff --git a/security/Kconfig b/security/Kconfig index 2b42c225de28..3aa60791f84d 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -156,6 +156,17 @@ config HARDENED_USERCOPY or are part of the kernel text. This kills entire classes of heap overflow exploits and similar kernel memory exposures. +config HARDENED_USERCOPY_PAGESPAN + bool "Refuse to copy allocations that span multiple pages" + depends on HARDENED_USERCOPY + depends on !COMPILE_TEST + help + When a multi-page allocation is done without __GFP_COMP, + hardened usercopy will reject attempts to copy it. There are, + however, several cases of this in the kernel that have not all + been removed. This config is intended to be used only while + trying to find such users. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From a38338f1cd595f6a66870ba46cc6a4ae7378c084 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Wed, 31 Aug 2016 08:09:04 -0700 Subject: [PATCH 0305/3220] FROMLIST: pstore: drop pmsg bounce buffer (from https://lkml.org/lkml/2016/9/1/428) (cherry pick from android-3.10 commit b58133100b38f2bf83cad2d7097417a3a196ed0b) Removing a bounce buffer copy operation in the pmsg driver path is always better. We also gain in overall performance by not requesting a vmalloc on every write as this can cause precious RT tasks, such as user facing media operation, to stall while memory is being reclaimed. Added a write_buf_user to the pstore functions, a backup platform write_buf_user that uses the small buffer that is part of the instance, and implemented a ramoops write_buf_user that only supports PSTORE_TYPE_PMSG. 
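From user space nothing changes: a write to the pmsg device still produces one persistent record, it just no longer round-trips through a vmalloc'd bounce buffer. A hypothetical smoke test (not part of the patch; assumes the usual /dev/pmsg0 node):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char msg[] = "boot-marker: pmsg write path";
		int fd = open("/dev/pmsg0", O_WRONLY);

		if (fd < 0)
			return 1;
		/* One write() == one record; the driver now copies the user
		 * buffer straight into the persistent ram zone. */
		if (write(fd, msg, strlen(msg)) != (ssize_t)strlen(msg)) {
			close(fd);
			return 1;
		}
		return close(fd) ? 1 : 0;
	}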
Signed-off-by: Mark Salyzyn Bug: 31057326 Change-Id: I4cdee1cd31467aa3e6c605bce2fbd4de5b0f8caa --- fs/pstore/platform.c | 36 +++++++++++++++++++++++++++++ fs/pstore/pmsg.c | 35 +++++----------------------- fs/pstore/ram.c | 19 +++++++++++++++ fs/pstore/ram_core.c | 47 ++++++++++++++++++++++++++++++++++++-- include/linux/pstore.h | 11 ++++++--- include/linux/pstore_ram.h | 7 ++++-- 6 files changed, 119 insertions(+), 36 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 588461bb2dd4..40a0fe0a4e05 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -431,6 +431,40 @@ static int pstore_write_compat(enum pstore_type_id type, size, psi); } +static int pstore_write_buf_user_compat(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + unsigned long flags = 0; + size_t i, bufsize = size; + long ret = 0; + + if (unlikely(!access_ok(VERIFY_READ, buf, size))) + return -EFAULT; + if (bufsize > psinfo->bufsize) + bufsize = psinfo->bufsize; + spin_lock_irqsave(&psinfo->buf_lock, flags); + for (i = 0; i < size; ) { + size_t c = min(size - i, bufsize); + + ret = __copy_from_user(psinfo->buf, buf + i, c); + if (unlikely(ret != 0)) { + ret = -EFAULT; + break; + } + ret = psi->write_buf(type, reason, id, part, psinfo->buf, + compressed, c, psi); + if (unlikely(ret < 0)) + break; + i += c; + } + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + return unlikely(ret < 0) ? ret : size; +} + /* * platform specific persistent storage driver registers with * us here. If pstore is already mounted, call the platform @@ -453,6 +487,8 @@ int pstore_register(struct pstore_info *psi) if (!psi->write) psi->write = pstore_write_compat; + if (!psi->write_buf_user) + psi->write_buf_user = pstore_write_buf_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 7de20cd3797f..78f6176c020f 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -19,48 +19,25 @@ #include "internal.h" static DEFINE_MUTEX(pmsg_lock); -#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - size_t i, buffer_size; - char *buffer; + u64 id; + int ret; if (!count) return 0; + /* check outside lock, page in any data. write_buf_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - buffer_size = count; - if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) - buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; - buffer = vmalloc(buffer_size); - if (!buffer) - return -ENOMEM; - mutex_lock(&pmsg_lock); - for (i = 0; i < count; ) { - size_t c = min(count - i, buffer_size); - u64 id; - long ret; - - ret = __copy_from_user(buffer, buf + i, c); - if (unlikely(ret != 0)) { - mutex_unlock(&pmsg_lock); - vfree(buffer); - return -EFAULT; - } - psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, - psinfo); - - i += c; - } - + ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, + psinfo); mutex_unlock(&pmsg_lock); - vfree(buffer); - return count; + return ret ? 
ret : count; } static const struct file_operations pmsg_fops = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 414041342a99..5b10c2b4146c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -331,6 +331,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return 0; } +static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + if (type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = psi->data; + + if (!cxt->mprz) + return -ENOMEM; + return persistent_ram_write_user(cxt->mprz, buf, size); + } + + return -EINVAL; +} + static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi) { @@ -369,6 +387,7 @@ static struct ramoops_context oops_cxt = { .open = ramoops_pstore_open, .read = ramoops_pstore_read, .write_buf = ramoops_pstore_write_buf, + .write_buf_user = ramoops_pstore_write_buf_user, .erase = ramoops_pstore_erase, }, }; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 76c3f80efdfa..aa9afe573155 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,15 +17,16 @@ #include #include #include -#include #include #include +#include #include #include +#include #include #include +#include #include -#include #include struct persistent_ram_buffer { @@ -303,6 +304,16 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz, persistent_ram_update_ecc(prz, start, count); } +static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? + -EFAULT : 0; + persistent_ram_update_ecc(prz, start, count); + return ret; +} + void persistent_ram_save_old(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; @@ -356,6 +367,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz, return count; } +int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count) +{ + int rem, ret = 0, c = count; + size_t start; + + if (unlikely(!access_ok(VERIFY_READ, s, count))) + return -EFAULT; + if (unlikely(c > prz->buffer_size)) { + s += c - prz->buffer_size; + c = prz->buffer_size; + } + + buffer_size_add(prz, c); + + start = buffer_start_add(prz, c); + + rem = prz->buffer_size - start; + if (unlikely(rem < c)) { + ret = persistent_ram_update_user(prz, s, start, rem); + s += rem; + c -= rem; + start = 0; + } + if (likely(!ret)) + ret = persistent_ram_update_user(prz, s, start, c); + + persistent_ram_update_header_ecc(prz); + + return unlikely(ret) ? 
ret : count; +} + size_t persistent_ram_old_size(struct persistent_ram_zone *prz) { return prz->old_log_size; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 831479f8df8f..5cae2c6c90ad 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -22,12 +22,13 @@ #ifndef _LINUX_PSTORE_H #define _LINUX_PSTORE_H -#include +#include +#include #include #include -#include #include -#include +#include +#include /* types */ enum pstore_type_id { @@ -67,6 +68,10 @@ struct pstore_info { enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char *buf, bool compressed, size_t size, struct pstore_info *psi); + int (*write_buf_user)(enum pstore_type_id type, + enum kmsg_dump_reason reason, u64 *id, + unsigned int part, const char __user *buf, + bool compressed, size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 712757f320a4..45ac5a0d29ee 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -17,11 +17,12 @@ #ifndef __LINUX_PSTORE_RAM_H__ #define __LINUX_PSTORE_RAM_H__ +#include #include +#include #include #include #include -#include struct persistent_ram_buffer; struct rs_control; @@ -59,7 +60,9 @@ void persistent_ram_free(struct persistent_ram_zone *prz); void persistent_ram_zap(struct persistent_ram_zone *prz); int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, - unsigned int count); + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); void persistent_ram_save_old(struct persistent_ram_zone *prz); size_t persistent_ram_old_size(struct persistent_ram_zone *prz); From 81330959ee15a682fbefcdd4f5350a979d8e8fdb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 8 Sep 2016 16:43:21 -0700 Subject: [PATCH 0306/3220] cpufreq: Kconfig: Fixup incorrect selection by CPU_FREQ_DEFAULT_GOV_SCHED The CPU_FREQ_DEFAULT_GOV_SCHED option is incorrectly selecting CPU_FREQ_GOV_INTERACTIVE, when it should be selecting CPU_FREQ_GOV_SCHED. Signed-off-by: John Stultz --- drivers/cpufreq/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index d423f51d3276..a2d63d86bb70 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -105,7 +105,7 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE config CPU_FREQ_DEFAULT_GOV_SCHED bool "sched" - select CPU_FREQ_GOV_INTERACTIVE + select CPU_FREQ_GOV_SCHED help Use the CPUfreq governor 'sched' as default. This scales cpu frequency using CPU utilization estimates from the From 1c81f45725d2cee3c11e1d213eb30cc7f8feea0a Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 25 Aug 2016 11:06:37 +0530 Subject: [PATCH 0307/3220] sched/walt: include missing header for arm_timer_read_counter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include clocksource/arm_arch_timer.h to fix implicit function declaration of ‘arch_timer_read_counter’ build error for ARCH=arm. 
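For illustration, the failing pattern reduces to calling a function with no prototype in scope, which -Werror=implicit-function-declaration turns into a hard error. A minimal sketch (the wrapper name is hypothetical; only the include and the call mirror this patch):

    #include <clocksource/arm_arch_timer.h> /* declares arch_timer_read_counter() */

    static u64 walt_read_counter(void)      /* hypothetical helper */
    {
            /* without the include above, this call is an implicit
             * declaration and the ARCH=arm build fails */
            return arch_timer_read_counter();
    }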
Signed-off-by: Amit Pundir [jstultz: Cherry-picked from common/android-3.18] Signed-off-by: John Stultz --- kernel/sched/walt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index d9d09914ce30..07b7f84b37e2 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "sched.h" #include "walt.h" From 0f959e0a3261539eb14420d02a491188cc52f9e8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 15 Mar 2016 12:14:49 +0100 Subject: [PATCH 0308/3220] BACKPORT: ALSA: usb-audio: Minor code cleanup in create_fixed_stream_quirk() (cherry picked from commit 902eb7fd1e4af3ac69b9b30f8373f118c92b9729) Just a minor code cleanup: unify the error paths. Signed-off-by: Takashi Iwai Change-Id: I8253a86235df2ac1258153c9e128fa158527567f Bug: 30952477 --- sound/usb/quirks.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index b6c0c8e3b450..a85b1953d83a 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -167,16 +167,12 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, stream = (fp->endpoint & USB_DIR_IN) ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; err = snd_usb_add_audio_stream(chip, stream, fp); - if (err < 0) { - kfree(fp); - kfree(rate_table); - return err; - } + if (err < 0) + goto error; if (fp->iface != get_iface_desc(&iface->altsetting[0])->bInterfaceNumber || fp->altset_idx >= iface->num_altsetting) { - kfree(fp); - kfree(rate_table); - return -EINVAL; + err = -EINVAL; + goto error; } alts = &iface->altsetting[fp->altset_idx]; altsd = get_iface_desc(alts); @@ -190,6 +186,11 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, snd_usb_init_pitch(chip, fp->iface, alts, fp); snd_usb_init_sample_rate(chip, fp->iface, alts, fp, fp->rate_max); return 0; + + error: + kfree(fp); + kfree(rate_table); + return err; } static int create_auto_pcm_quirk(struct snd_usb_audio *chip, From 571763be0e7f0aab3aa3eb02bb315b63539cc10c Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Thu, 31 Mar 2016 12:05:43 -0400 Subject: [PATCH 0309/3220] UPSTREAM: ALSA: usb-audio: Fix double-free in error paths after snd_usb_add_audio_stream() call (cherry picked from commit 836b34a935abc91e13e63053d0a83b24dfb5ea78) create_fixed_stream_quirk(), snd_usb_parse_audio_interface() and create_uaxx_quirk() functions allocate the audioformat object by themselves and free it upon error before returning. However, once the object is linked to a stream, it's freed again in snd_usb_audio_pcm_free(), thus it'll be double-freed, eventually resulting in a memory corruption. This patch fixes these failures in the error paths by unlinking the audioformat object before freeing it. 
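The shape of the fix, as a short hedged sketch (the names here are illustrative stand-ins, not the driver's API): once an object is linked into a list that the release path walks and frees, an error path must unlink it before freeing it itself.

    struct fmt {                    /* stand-in for the audioformat */
            struct list_head list;
            /* ... */
    };

    fp = kzalloc(sizeof(*fp), GFP_KERNEL);
    INIT_LIST_HEAD(&fp->list);      /* keeps list_del() below safe */

    err = add_stream(chip, fp);     /* may link fp into a fmt_list */
    if (err < 0) {
            list_del(&fp->list);    /* unlink, or release frees fp again */
            kfree(fp);
            return err;
    }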
Based on a patch by Takashi Iwai [Note for stable backports: this patch requires the commit 902eb7fd1e4a ('ALSA: usb-audio: Minor code cleanup in create_fixed_stream_quirk()')] Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1283358 Reported-by: Ralf Spenneberg Cc: # see the note above Signed-off-by: Vladis Dronov Signed-off-by: Takashi Iwai Change-Id: I7073a17d8c99886d2f6ed7981892712ba7dd5873 Bug: 30952477 --- sound/usb/quirks.c | 4 ++++ sound/usb/stream.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index a85b1953d83a..79c1cccc606a 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -150,6 +150,7 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, usb_audio_err(chip, "cannot memdup\n"); return -ENOMEM; } + INIT_LIST_HEAD(&fp->list); if (fp->nr_rates > MAX_NR_RATES) { kfree(fp); return -EINVAL; @@ -188,6 +189,7 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, return 0; error: + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); kfree(rate_table); return err; @@ -463,6 +465,7 @@ static int create_uaxx_quirk(struct snd_usb_audio *chip, fp->ep_attr = get_endpoint(alts, 0)->bmAttributes; fp->datainterval = 0; fp->maxpacksize = le16_to_cpu(get_endpoint(alts, 0)->wMaxPacketSize); + INIT_LIST_HEAD(&fp->list); switch (fp->maxpacksize) { case 0x120: @@ -486,6 +489,7 @@ static int create_uaxx_quirk(struct snd_usb_audio *chip, ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) { + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); return err; } diff --git a/sound/usb/stream.c b/sound/usb/stream.c index 8ee14f2365e7..3b23102230c0 100644 --- a/sound/usb/stream.c +++ b/sound/usb/stream.c @@ -316,7 +316,9 @@ static struct snd_pcm_chmap_elem *convert_chmap(int channels, unsigned int bits, /* * add this endpoint to the chip instance. * if a stream with the same endpoint already exists, append to it. - * if not, create a new pcm stream. + * if not, create a new pcm stream. note, fp is added to the substream + * fmt_list and will be freed on the chip instance release. do not free + * fp or do remove it from the substream fmt_list to avoid double-free. */ int snd_usb_add_audio_stream(struct snd_usb_audio *chip, int stream, @@ -677,6 +679,7 @@ int snd_usb_parse_audio_interface(struct snd_usb_audio *chip, int iface_no) * (fp->maxpacksize & 0x7ff); fp->attributes = parse_uac_endpoint_attributes(chip, alts, protocol, iface_no); fp->clock = clock; + INIT_LIST_HEAD(&fp->list); /* some quirks for attributes here */ @@ -725,6 +728,7 @@ int snd_usb_parse_audio_interface(struct snd_usb_audio *chip, int iface_no) dev_dbg(&dev->dev, "%u:%d: add audio endpoint %#x\n", iface_no, altno, fp->endpoint); err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) { + list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp->rate_table); kfree(fp->chmap); kfree(fp); From 6c7e03dde14e5074134f47d9ec3dea7de24e4e58 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 29 Aug 2016 19:48:17 +0530 Subject: [PATCH 0310/3220] DEBUG: cpufreq: fix cpu_capacity tracing build for non-smp systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpu curr capacity can only be traced for SMP systems. 
Non-SMP builds will fail with: drivers/cpufreq/cpufreq.c: In function ‘cpufreq_freq_transition_begin’: drivers/cpufreq/cpufreq.c:438:22: error: implicit declaration of function ‘capacity_curr_of’ [-Werror=implicit-function-declaration] trace_cpu_capacity(capacity_curr_of(cpu), cpu); ^ Fixes: Change-Id: Icd0930d11068fcb7d2b6a9a48e7ed974904e1081 ("DEBUG: sched,cpufreq: add cpu_capacity change tracepoint") Signed-off-by: Amit Pundir [jstultz: Cherry-picked from common/android-3.18] Signed-off-by: John Stultz --- drivers/cpufreq/cpufreq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7264820e6443..7b728143440d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -29,7 +29,9 @@ #include #include #include +#ifdef CONFIG_SMP #include +#endif #include static LIST_HEAD(cpufreq_policy_list); @@ -474,7 +476,9 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { +#ifdef CONFIG_SMP int cpu; +#endif /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -503,8 +507,10 @@ wait: spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); +#ifdef CONFIG_SMP for_each_cpu(cpu, policy->cpus) trace_cpu_capacity(capacity_curr_of(cpu), cpu); +#endif cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } From aeb4a3112e516ef93069acdc04b65e9ae5882d28 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 24 Aug 2016 11:52:17 +0530 Subject: [PATCH 0311/3220] sched/walt: use do_div instead of division operator Use do_div() instead of "/" operator to fix undefined references to "__aeabi_uldivmod" build error for ARCH=arm. Also in TP_fast_assign(), along with do_div() usage, replace "," with ";" which would have resulted in a syntax error (!), because '#define TP_fast_assign(args...) args' would have stripped off the "," and left white space between these two assignments after CPP phase. 
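For reference, a minimal sketch of the substitution (assuming, as do_div() requires, a 64-bit dividend and a 32-bit divisor):

    #include <asm/div64.h>

    u64 avg = demand << 10;
    /* 'avg = avg / walt_ravg_window;' would emit a call to
     * __aeabi_uldivmod on 32-bit ARM; do_div() divides in place
     * and returns the remainder instead */
    do_div(avg, walt_ravg_window);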
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from common/android-3.18]
Signed-off-by: John Stultz
---
 include/trace/events/sched.h | 3 ++-
 kernel/sched/sched.h         | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index c50310a7fd6d..dffaffab4bc8 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -1057,7 +1057,8 @@ TRACE_EVENT(walt_update_history,
 		__entry->samples = samples;
 		__entry->evt = evt;
 		__entry->demand = p->ravg.demand;
-		__entry->walt_avg = (__entry->demand << 10) / walt_ravg_window,
+		__entry->walt_avg = (__entry->demand << 10);
+		do_div(__entry->walt_avg, walt_ravg_window);
 		__entry->pelt_avg = p->se.avg.util_avg;
 		memcpy(__entry->hist, p->ravg.sum_history,
 		       RAVG_HIST_SIZE_MAX * sizeof(u32));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4e2e44e3cc7b..06f15724a639 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1577,9 +1577,10 @@ static inline unsigned long __cpu_util(int cpu, int delta)
 	unsigned long capacity = capacity_orig_of(cpu);
 
 #ifdef CONFIG_SCHED_WALT
-	if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
-		util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) /
-			walt_ravg_window;
+	if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+		util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT;
+		do_div(util, walt_ravg_window);
+	}
 #endif
 	delta += util;
 	if (delta < 0)

From ad607cd620b8caeebf3ed3336fa80921c588b27f Mon Sep 17 00:00:00 2001
From: Steve Muckle
Date: Wed, 4 May 2016 18:56:45 -0700
Subject: [PATCH 0312/3220] arm: Fix build error "conflicting types for 'scale_cpu_capacity'"

Commit "arm: Update arch_scale_cpu_capacity() to reflect change to
define" introduced a dependency on struct sched_domain in
arch/arm/include/asm/topology.h, but that structure is currently only
declared if CONFIG_CPU_FREQ is enabled, because only then is
include/linux/cpufreq.h, which declares it, pulled in.

Include <linux/cpufreq.h> regardless of CONFIG_CPU_FREQ so that
struct sched_domain is always declared.

Fixes: Change-Id: I372bd5e4c1e203428d72b18c8a806b06f3567ef6 ("arm: Update arch_scale_cpu_capacity() to reflect change to define")
Signed-off-by: Steve Muckle
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from android-3.18]
Signed-off-by: John Stultz
---
 arch/arm/include/asm/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index e3e596cbb1a7..d06064120694 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_ARM_CPU_TOPOLOGY
 
+#include <linux/cpufreq.h>
 #include <linux/cpumask.h>
 
 struct cputopo_arm {
@@ -25,7 +26,6 @@ void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #ifdef CONFIG_CPU_FREQ
-#include <linux/cpufreq.h>
 #define arch_scale_freq_capacity cpufreq_scale_freq_capacity
 #endif
 #define arch_scale_cpu_capacity scale_cpu_capacity

From 23eeab46c8550123a5e66b7b11862118dbb05d69 Mon Sep 17 00:00:00 2001
From: Jon Medhurst
Date: Thu, 2 Jun 2016 12:18:08 +0000
Subject: [PATCH 0313/3220] arm: Fix #if/#ifdef typo in topology.c

Probably a typo in arch/arm/kernel/topology.c. This patch fixes the
warning:

  arch/arm/kernel/topology.c: In function 'scale_cpu_capacity':
  arch/arm/kernel/topology.c:47:5: warning: "CONFIG_CPU_FREQ" is not defined [-Wundef]

Fixes: Change-Id: If5e9e0ba8ff5a5d3236b373dbce8c72ea71b5e18 ("arm: Enable max freq invariant scheduler load-tracking and capacity support")
Signed-off-by: Jon Medhurst
Signed-off-by: Amit Pundir
[jstultz: Cherry-picked from android-3.18]
Signed-off-by: John Stultz
---
 arch/arm/kernel/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index f5941004efba..4f2c51ef162d 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,7 +44,7 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale);
 
 unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-#if CONFIG_CPU_FREQ
+#ifdef CONFIG_CPU_FREQ
 	unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
 
 	return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;

From fb3cce0136514c65c2d1a9d2285a357cfb96d54c Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Wed, 24 Aug 2016 11:02:29 +0100
Subject: [PATCH 0314/3220] FIXUP: sched/tune: add fixes missing from a previous patch

The previous patch:

  e7ce26f - FIXUP: sched/tune: fix accounting for runnable tasks

squashed together patches of a series to fix SchedTune's accounting
issues. However, in the consolidation and cleanup of the series to merge
in the Android Common Kernel, we somehow missed a couple of important
changes:

1) the schedtune_exit function is no longer required, because e7ce26f
   fixes accounting of exiting tasks in a different way
2) the schedtune_initialized flag was not set at the end of
   schedtune_init_cgroups(), thus failing to enable SchedTune at boot.

This patch is thus to be considered an integration of e7ce26f.

Signed-off-by: Patrick Bellasi
[jstultz: Cherry-picked from android-3.18. It should be noted that some
of this patch was already applied in the 4.4 patches (schedtune_exit
doesn't exist for example), but this patch just ensures things are
totally synced up]
Signed-off-by: John Stultz
---
 kernel/sched/tune.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index bd7f319ce53e..505d7b35b0e1 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -715,7 +715,7 @@ schedtune_css_free(struct cgroup_subsys_state *css)
 struct cgroup_subsys schedtune_cgrp_subsys = {
 	.css_alloc	= schedtune_css_alloc,
 	.css_free	= schedtune_css_free,
-//	.allow_attach	= schedtune_allow_attach,
+	.allow_attach	= schedtune_allow_attach,
 	.can_attach	= schedtune_can_attach,
 	.cancel_attach	= schedtune_cancel_attach,
 	.legacy_cftypes	= files,
@@ -736,6 +736,8 @@ schedtune_init_cgroups(void)
 	pr_info("schedtune: configured to support %d boost groups\n",
 		BOOSTGROUPS_COUNT);
+
+	schedtune_initialized = true;
 }
 
 #else /* CONFIG_CGROUP_SCHEDTUNE */

From a85045c03470e88923ed98bb5917b289fa752030 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Wed, 24 Aug 2016 11:27:27 +0100
Subject: [PATCH 0315/3220] FIXUP: sched/tune: update accounting before CPU capacity

The SchedTune tasks accounting is used to identify how many tasks are
in a boostgroup and thus to bias the selection of an OPP based on the
maximum boost value of the active boostgroups.

The current implementation however updates the accounting after the CPU
capacity has been updated. This has two effects:
a) when we enqueue a boosted task, we do not immediately boost its CPU
b) when we dequeue a boosted task, we can keep a CPU boosted even if
   not required

This patch changes the order of the SchedTune accounting and SchedFreq
updates to ensure we always have an updated representation of which
boosted tasks are runnable on a CPU before updating its capacity.

Reported-by: Leo Yan
Signed-off-by: Patrick Bellasi
---
 kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c362be1df195..2cd3e2671f16 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4240,6 +4240,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+	/*
+	 * Update SchedTune accounting.
+	 *
+	 * We do it before updating the CPU capacity to ensure the
+	 * boost value of the current task is accounted for in the
+	 * selection of the OPP.
+	 *
+	 * We do it also in the case where we enqueue a throttled task;
+	 * we could argue that a throttled task should not boost a CPU,
+	 * however:
+	 * a) properly implementing CPU boosting considering throttled
+	 *    tasks will increase a lot the complexity of the solution
+	 * b) it's not easy to quantify the benefits introduced by
+	 *    such a more complex solution.
+	 * Thus, for the time being we go for the simple solution and boost
+	 * also for throttled RQs.
+	 */
+	schedtune_enqueue_task(p, cpu_of(rq));
+
 	if (!se) {
 		walt_inc_cumulative_runnable_avg(rq, p);
 		if (!task_new && !rq->rd->overutilized &&
@@ -4259,9 +4278,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			update_capacity_of(cpu_of(rq));
 	}
 
-	/* Update SchedTune accouting */
-	schedtune_enqueue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 	hrtick_update(rq);
 }
@@ -4327,6 +4343,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+	/*
+	 * Update SchedTune accounting
+	 *
+	 * We do it before updating the CPU capacity to ensure the
+	 * boost value of the current task is accounted for in the
+	 * selection of the OPP.
+	 */
+	schedtune_dequeue_task(p, cpu_of(rq));
+
 	if (!se) {
 		walt_dec_cumulative_runnable_avg(rq, p);
 
@@ -4346,9 +4371,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		}
 	}
 
-	/* Update SchedTune accouting */
-	schedtune_dequeue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 
 	hrtick_update(rq);

From 1510827f29ced43ac686c4625ce709e0dfb1292d Mon Sep 17 00:00:00 2001
From: Jungseung Lee
Date: Tue, 29 Dec 2015 04:47:00 +0800
Subject: [PATCH 0316/3220] UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor

The VMSA field of MMFR0 (bottom 4 bits) is incremented for each added
feature. PXN is supported if the value is >= 4 and LPAE is supported if
it is >= 5.

In case a kernel with CONFIG_ARM_LPAE disabled is used on a processor
that supports LPAE, we can still use PXN in short descriptors. So check
for >= 4 not == 4.

Signed-off-by: Jungseung Lee
Acked-by: Catalin Marinas
Signed-off-by: Ben Hutchings
Signed-off-by: Russell King
---
 arch/arm/mm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4867f5daf82c..de9f8921e407 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -572,7 +572,7 @@ static void __init build_mem_type_table(void)
 	 * in the Short-descriptor translation table format descriptors.
*/ if (cpu_arch == CPU_ARCH_ARMv7 && - (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) == 4) { + (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) { user_pmd_table |= PMD_PXNTABLE; } #endif From 702ea26d1f7d421a31831de98ea02e92c6782c20 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 19 Jul 2016 17:42:57 -0400 Subject: [PATCH 0317/3220] UPSTREAM: audit: fix a double fetch in audit_log_single_execve_arg() (cherry picked from commit 43761473c254b45883a64441dd0bc85a42f3645c) There is a double fetch problem in audit_log_single_execve_arg() where we first check the execve(2) argumnets for any "bad" characters which would require hex encoding and then re-fetch the arguments for logging in the audit record[1]. Of course this leaves a window of opportunity for an unsavory application to munge with the data. This patch reworks things by only fetching the argument data once[2] into a buffer where it is scanned and logged into the audit records(s). In addition to fixing the double fetch, this patch improves on the original code in a few other ways: better handling of large arguments which require encoding, stricter record length checking, and some performance improvements (completely unverified, but we got rid of some strlen() calls, that's got to be a good thing). As part of the development of this patch, I've also created a basic regression test for the audit-testsuite, the test can be tracked on GitHub at the following link: * https://github.com/linux-audit/audit-testsuite/issues/25 [1] If you pay careful attention, there is actually a triple fetch problem due to a strnlen_user() call at the top of the function. [2] This is a tiny white lie, we do make a call to strnlen_user() prior to fetching the argument data. I don't like it, but due to the way the audit record is structured we really have no choice unless we copy the entire argument at once (which would require a rather wasteful allocation). The good news is that with this patch the kernel no longer relies on this strnlen_user() value for anything beyond recording it in the log, we also update it with a trustworthy value whenever possible. Reported-by: Pengfei Wang Cc: Signed-off-by: Paul Moore Signed-off-by: Sasha Levin Change-Id: I10e979e94605e3cf8d461e3e521f8f9837228aa5 Bug: 30956807 --- kernel/auditsc.c | 332 +++++++++++++++++++++++------------------------ 1 file changed, 164 insertions(+), 168 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc04959de..48f45987dc6c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include "audit.h" @@ -82,7 +83,8 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ @@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. 
In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) -{ - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; - } - - /* walk the whole argument looking for non-ascii chars */ - do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. - */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } - - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 
2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} - static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab) { - int i, len; - size_t len_sent = 0; - const char __user *p; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; - p = (const char __user *)current->mm->arg_start; + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... at this point in time 96 is plenty */ + char abuf[96]; - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. - */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { audit_panic("out of memory for argv string"); return; } + buf = buf_head; - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); + audit_log_format(*ab, "argc=%d", context->execve.argc); + + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; + do { + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? 
*/ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; + + /* length of the buffer in the audit record? */ + len_abuf = (encode ? len_buf * 2 : len_buf + 2); + } + + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } + + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + } else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } + + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); + + /* NOTE: the caller handles the final audit_log_end() call */ + +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic) From 8a627afb7a4ffda3d6f87d9bcaa19b839d247f5d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 23 Feb 2016 14:58:52 -0800 Subject: [PATCH 0318/3220] UPSTREAM: x86: fix SMAP in 32-bit environments (cherry picked from commit de9e478b9d49f3a0214310d921450cf5bb4a21e6) In commit 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses") I changed how the stac/clac instructions were generated around the user space accesses, which then made it possible to do batched accesses efficiently for user string copies etc. 
However, in doing so, I completely spaced out, and didn't even think about the 32-bit case. And nobody really even seemed to notice, because SMAP doesn't even exist until modern Skylake processors, and you'd have to be crazy to run 32-bit kernels on a modern CPU. Which brings us to Andy Lutomirski. He actually tested the 32-bit kernel on new hardware, and noticed that it doesn't work. My bad. The trivial fix is to add the required uaccess begin/end markers around the raw accesses in . I feel a bit bad about this patch, just because that header file really should be cleaned up to avoid all the duplicated code in it, and this commit just expands on the problem. But this just fixes the bug without any bigger cleanup surgery. Reported-and-tested-by: Andy Lutomirski Signed-off-by: Linus Torvalds Change-Id: Ic044ebfe658a13179984111d062ca3a0b1404110 Signed-off-by: Amit Pundir --- arch/x86/include/asm/uaccess_32.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 2ffc9188cc70..c3724acd0fe4 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -49,20 +49,28 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __put_user_size(*(u8 *)from, (u8 __user *)to, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __put_user_size(*(u16 *)from, (u16 __user *)to, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); + __uaccess_end(); return ret; case 8: + __uaccess_begin(); __put_user_size(*(u64 *)from, (u64 __user *)to, 8, ret, 8); + __uaccess_end(); return ret; } } @@ -104,13 +112,19 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -150,13 +164,19 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } @@ -172,13 +192,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, switch (n) { case 1: + __uaccess_begin(); __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); return ret; case 2: + __uaccess_begin(); __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); return ret; case 4: + __uaccess_begin(); __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); return ret; } } From 5b432907fedfa6413bfe5971068853a786c4be16 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Nov 2015 13:26:19 +0000 Subject: [PATCH 0319/3220] UPSTREAM: arm64: mm: detect bad __create_mapping uses If a caller of __create_mapping provides a PA and VA which have different sub-page offsets, it is not clear which offset they expect to apply to the mapping, and is indicative of a bad caller. 
In some cases, the region we wish to map may validly have a sub-page offset in the physical and virtual addresses. For example, EFI runtime regions have 4K granularity, yet may be mapped by a 64K page kernel. So long as the physical and virtual offsets are the same, the region will be mapped at the expected VAs. Disallow calls with differing sub-page offsets, and WARN when they are encountered, so that we can detect and fix such cases. Cc: Laura Abbott Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Reviewed-by: Steve Capper Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: __create_mapping-fixes (cherry picked from commit cc5d2b3b95cdbb3fed4e38e667d17b9ac7250f7a) Signed-off-by: Jeff Vander Stoep Change-Id: I114a1265b10ff76daff385728d2125e618c313a1 --- arch/arm64/mm/mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9a8dc4a79d8b..52f03bb07429 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -251,6 +251,13 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, { unsigned long addr, length, end, next; + /* + * If the virtual and physical address don't have the same offset + * within a page, we cannot map the region as the caller expects. + */ + if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) + return; + addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); From ab8158d22085047097653a799fc9bae6e9920edf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Nov 2015 13:26:20 +0000 Subject: [PATCH 0320/3220] UPSTREAM: arm64: mm: allow sections for unaligned bases Callees of __create_mapping may decide to create section mappings if sufficient low bits of the physical and virtual addresses they were passed are zero. While __create_mapping rounds the virtual base address down, it does not similarly round the physical base address down, and hence non-zero bits in the physical address can prevent use of a section mapping, even where a whole next-level table would be used instead. Round down the physical base address in __create_mapping to enable all callees to always create section mappings when such a mapping is possible. Cc: Laura Abbott Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Reviewed-by: Steve Capper Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: __create_mapping-fixes (cherry picked from commit 9c4e08a3022b6df90d31ef4007291faabfce5431) Signed-off-by: Jeff Vander Stoep Change-Id: Ic447350efdba3cd9f9e101c72183e04e39dd28d2 --- arch/arm64/mm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 52f03bb07429..4c381f0e77a7 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -258,6 +258,7 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) return; + phys &= PAGE_MASK; addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); From 7152164d7b5fdfe68423ca1568fc227e92d3d6f7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Oct 2015 18:56:19 +0000 Subject: [PATCH 0321/3220] UPSTREAM: arm64: pgtable: implement pte_accessible() This patch implements the pte_accessible() macro, which can be used to test whether or not a given pte is a candidate for allocation in the TLB. 
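As a hedged sketch of the intended use, mirroring the generic ptep_clear_flush() pattern (simplified):

    pte = ptep_get_and_clear(mm, address, ptep);
    if (pte_accessible(mm, pte))
            /* only flush if the old pte may still sit in the TLB */
            flush_tlb_page(vma, address);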
Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 76c714be0e5e60c935a53b31be58939510ba1d0f) Signed-off-by: Jeff Vander Stoep Change-Id: I249e2d15665870149dd17d1cdb3850008f5a56fd --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 63f52b55defe..824416a219e7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -167,6 +167,16 @@ extern struct page *empty_zero_page; ((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) +#define pte_valid_young(pte) \ + ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF)) + +/* + * Could the pte be present in the TLB? We must check mm_tlb_flush_pending + * so that we don't erroneously return false for pages that have been + * remapped as PROT_NONE but are yet to be flushed from the TLB. + */ +#define pte_accessible(mm, pte) \ + (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid_young(pte)) static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { From 4277d7534ed7af7f103e20504dff3aead7799c99 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Fri, 2 Sep 2016 09:37:24 +0100 Subject: [PATCH 0322/3220] UPSTREAM: brcmfmac: avoid potential stack overflow in brcmf_cfg80211_start_ap() commit ded89912156b1a47d940a0c954c43afbabd0c42c upstream User-space can choose to omit NL80211_ATTR_SSID and only provide raw IE TLV data. When doing so it can provide SSID IE with length exceeding the allowed size. The driver further processes this IE copying it into a local variable without checking the length. Hence stack can be corrupted and used as exploit. Cc: stable@vger.kernel.org # v4.4, v4.1 Reported-by: Daxing Guo Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c index deb5f78dcacc..786fff3d7466 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c @@ -4099,7 +4099,7 @@ brcmf_cfg80211_start_ap(struct wiphy *wiphy, struct net_device *ndev, (u8 *)&settings->beacon.head[ie_offset], settings->beacon.head_len - ie_offset, WLAN_EID_SSID); - if (!ssid_ie) + if (!ssid_ie || ssid_ie->len > IEEE80211_MAX_SSID_LEN) return -EINVAL; memcpy(ssid_le.SSID, ssid_ie->data, ssid_ie->len); From 1a8cf46bbfcc2eac1db8cfcc9c3b0596b9ee1d66 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 14 Apr 2016 17:01:17 +0200 Subject: [PATCH 0323/3220] UPSTREAM: usb: gadget: f_fs: Fix use-after-free (cherry picked from commit 38740a5b87d53ceb89eb2c970150f6e94e00373a) When using asynchronous read or write operations on the USB endpoints the issuer of the IO request is notified by calling the ki_complete() callback of the submitted kiocb when the URB has been completed. Calling this ki_complete() callback will free kiocb. Make sure that the structure is no longer accessed beyond that point, otherwise undefined behaviour might occur. 
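The general rule being applied, sketched briefly: a completion callback may free the object it is handed, so anything still needed afterwards has to be read out first.

    /* snapshot before completion: ki_complete() may free the kiocb */
    bool kiocb_has_eventfd = io_data->kiocb->ki_flags & IOCB_EVENTFD;

    io_data->kiocb->ki_complete(io_data->kiocb, ret, ret);
    /* io_data->kiocb must not be dereferenced past this point */

    if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd)
            eventfd_signal(io_data->ffs->ffs_eventfd, 1);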
Fixes: 2e4c7553cd6f ("usb: gadget: f_fs: add aio support") Cc: # v3.15+ Signed-off-by: Lars-Peter Clausen Signed-off-by: Felipe Balbi Change-Id: I3c7b643f6440c4fb6160a57c1058523030b46a6c Bug: 30950866 --- drivers/usb/gadget/function/f_fs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index cf43e9e18368..79d895c2dd71 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -646,6 +646,7 @@ static void ffs_user_copy_worker(struct work_struct *work) work); int ret = io_data->req->status ? io_data->req->status : io_data->req->actual; + bool kiocb_has_eventfd = io_data->kiocb->ki_flags & IOCB_EVENTFD; if (io_data->read && ret > 0) { use_mm(io_data->mm); @@ -657,13 +658,11 @@ static void ffs_user_copy_worker(struct work_struct *work) io_data->kiocb->ki_complete(io_data->kiocb, ret, ret); - if (io_data->ffs->ffs_eventfd && - !(io_data->kiocb->ki_flags & IOCB_EVENTFD)) + if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd) eventfd_signal(io_data->ffs->ffs_eventfd, 1); usb_ep_free_request(io_data->ep, io_data->req); - io_data->kiocb->private = NULL; if (io_data->read) kfree(io_data->to_free); kfree(io_data->buf); From 6b51f3d20c52982bd71028509ebb3a4a2389850b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 20 Nov 2015 17:59:10 +0800 Subject: [PATCH 0324/3220] UPSTREAM: arm64: add __init/__initdata section marker to some functions/variables These functions/variables are not needed after booting, so mark them as __init or __initdata. Signed-off-by: Jisheng Zhang Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit a7c61a3452d39078919f0e1f493ff966fb64f0db) Signed-off-by: Jeff Vander Stoep Change-Id: I50a3362e186750e139d2440d2c1e1d49ace896e1 --- arch/arm64/kernel/armv8_deprecated.c | 6 +++--- arch/arm64/kernel/cpufeature.c | 9 +++++---- arch/arm64/kernel/fpsimd.c | 2 +- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/arm64/mm/init.c | 6 +++--- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 937f5e58a4d3..3e01207917b1 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -62,7 +62,7 @@ struct insn_emulation { }; static LIST_HEAD(insn_emulation); -static int nr_insn_emulated; +static int nr_insn_emulated __initdata; static DEFINE_RAW_SPINLOCK(insn_emulation_lock); static void register_emulation_hooks(struct insn_emulation_ops *ops) @@ -173,7 +173,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn, return ret; } -static void register_insn_emulation(struct insn_emulation_ops *ops) +static void __init register_insn_emulation(struct insn_emulation_ops *ops) { unsigned long flags; struct insn_emulation *insn; @@ -237,7 +237,7 @@ static struct ctl_table ctl_abi[] = { { } }; -static void register_insn_emulation_sysctl(struct ctl_table *table) +static void __init register_insn_emulation_sysctl(struct ctl_table *table) { unsigned long flags; int i = 0; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0669c63281ea..5c90aa490a2b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -684,7 +684,7 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = { {}, }; -static void cap_set_hwcap(const struct arm64_cpu_capabilities *cap) +static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case 
CAP_HWCAP: @@ -729,7 +729,7 @@ static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities * return rc; } -static void setup_cpu_hwcaps(void) +static void __init setup_cpu_hwcaps(void) { int i; const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; @@ -758,7 +758,8 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, * Run through the enabled capabilities and enable() it on all active * CPUs */ -static void enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) +static void __init +enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { int i; @@ -897,7 +898,7 @@ static inline void set_sys_caps_initialised(void) #endif /* CONFIG_HOTPLUG_CPU */ -static void setup_feature_capabilities(void) +static void __init setup_feature_capabilities(void) { update_cpu_capabilities(arm64_features, "detected feature:"); enable_cpu_capabilities(arm64_features); diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4c46c54a3ad7..acc1afd5c749 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -289,7 +289,7 @@ static struct notifier_block fpsimd_cpu_pm_notifier_block = { .notifier_call = fpsimd_cpu_pm_notifier, }; -static void fpsimd_pm_init(void) +static void __init fpsimd_pm_init(void) { cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block); } diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 6ad554b01e95..7b71b7c47adb 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -40,7 +40,7 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot, static struct gen_pool *atomic_pool; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; static int __init early_coherent_pool(char *p) { @@ -896,7 +896,7 @@ static int __iommu_attach_notifier(struct notifier_block *nb, return 0; } -static int register_iommu_dma_ops_notifier(struct bus_type *bus) +static int __init register_iommu_dma_ops_notifier(struct bus_type *bus) { struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL); int ret; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 325f89ff897e..d43715a14383 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -71,7 +71,7 @@ early_param("initrd", early_initrd); * currently assumes that for memory starting above 4G, 32-bit devices will * use a DMA offset. */ -static phys_addr_t max_zone_dma_phys(void) +static phys_addr_t __init max_zone_dma_phys(void) { phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32); return min(offset + (1ULL << 32), memblock_end_of_DRAM()); @@ -128,11 +128,11 @@ EXPORT_SYMBOL(pfn_valid); #endif #ifndef CONFIG_SPARSEMEM -static void arm64_memory_present(void) +static void __init arm64_memory_present(void) { } #else -static void arm64_memory_present(void) +static void __init arm64_memory_present(void) { struct memblock_region *reg; From f65f45205e5bae2464be051cb6deca598f62f3e8 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 2 Dec 2015 14:00:10 +0000 Subject: [PATCH 0325/3220] UPSTREAM: arm64: fix COMPAT_SHMLBA definition for large pages ARM glibc uses (4 * __getpagesize()) for SHMLBA, which is correct for 4KB pages and works fine for 64KB pages, but the kernel uses a hardcoded 16KB that is too small for 64KB page based kernels. This changes the definition to what user space sees when using 64KB pages. 
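To make the mismatch concrete (a worked illustration, not new behaviour): with 64KB kernel pages, user space computes SHMLBA as 4 * getpagesize() = 256KB, while the old hardcoded COMPAT_SHMLBA of 0x4000 (16KB) let the kernel pick shmat() addresses that user space does not consider SHMLBA-aligned. Defining it as (4 * PAGE_SIZE) restores agreement for every page size.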
Acked-by: Arnd Bergmann Signed-off-by: Yury Norov Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit b9b7aebb42d1b1392f3111de61136bb6cf3aae3f) Signed-off-by: Jeff Vander Stoep Change-Id: I698814d005a28c7fe3963cded9756f88660d4aa0 --- arch/arm64/include/asm/shmparam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/shmparam.h b/arch/arm64/include/asm/shmparam.h index 4df608a8459e..e368a55ebd22 100644 --- a/arch/arm64/include/asm/shmparam.h +++ b/arch/arm64/include/asm/shmparam.h @@ -21,7 +21,7 @@ * alignment value. Since we don't have aliasing D-caches, the rest of * the time we can safely use PAGE_SIZE. */ -#define COMPAT_SHMLBA 0x4000 +#define COMPAT_SHMLBA (4 * PAGE_SIZE) #include From 9b49bf531ba02bcdbc6d4182a179a49dd30349d9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Nov 2015 15:12:59 +0000 Subject: [PATCH 0326/3220] UPSTREAM: arm64: enable HAVE_IRQ_TIME_ACCOUNTING arm64 relies on the arm_arch_timer for sched_clock, so we can select HAVE_IRQ_TIME_ACCOUNTING and have the core sched-clock code enable the feature at runtime based on the rate. Reported-by: Mario Smarduch Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 24da208db32ee1e4757ceaba898c47add8e5361e) Signed-off-by: Jeff Vander Stoep Change-Id: I849e010459dbbde0ac0d44d665dadcea9f8bf12d --- Documentation/features/time/irq-time-acct/arch-support.txt | 2 +- arch/arm64/Kconfig | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt index e63316239938..4199ffecc0ff 100644 --- a/Documentation/features/time/irq-time-acct/arch-support.txt +++ b/Documentation/features/time/irq-time-acct/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | .. | | arc: | TODO | | arm: | ok | - | arm64: | .. | + | arm64: | ok | | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3ad9d9ea374b..92f234a5579b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -73,6 +73,7 @@ config ARM64 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_GENERIC_DMA_COHERENT select HAVE_HW_BREAKPOINT if PERF_EVENTS + select HAVE_IRQ_TIME_ACCOUNTING select HAVE_MEMBLOCK select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS From 5bb08d2a552a3a7ba1e84527ea7987018fcc2f69 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Nov 2015 17:48:31 +0000 Subject: [PATCH 0327/3220] UPSTREAM: arm64: spinlock: serialise spin_unlock_wait against concurrent lockers Boqun Feng reported a rather nasty ordering issue with spin_unlock_wait on architectures implementing spin_lock with LL/SC sequences and acquire semantics: | CPU 1 CPU 2 CPU 3 | ================== ==================== ============== | spin_unlock(&lock); | spin_lock(&lock): | r1 = *lock; // r1 == 0; | o = READ_ONCE(object); // reordered here | object = NULL; | smp_mb(); | spin_unlock_wait(&lock); | *lock = 1; | smp_mb(); | o->dead = true; | if (o) // true | BUG_ON(o->dead); // true!! The crux of the problem is that spin_unlock_wait(&lock) can return on CPU 1 whilst CPU 2 is in the process of taking the lock. 
This can be resolved by upgrading spin_unlock_wait to a LOCK operation, forcing it to serialise against a concurrent locker and giving it acquire semantics in the process (although it is not at all clear whether this is needed - different callers seem to assume different things about the barrier semantics and architectures are similarly disjoint in their implementations of the macro). This patch implements spin_unlock_wait using an LL/SC sequence with acquire semantics on arm64. For v8.1 systems with the LSE atomics, the exclusive writeback is omitted, since the spin_lock operation is indivisible and no intermediate state can be observed. Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit d86b8da04dfa4771a68bdbad6c424d40f22f0d14) Signed-off-by: Jeff Vander Stoep Change-Id: I27d88458c99dfd475d0326bd1d408ec3e575a7dd --- arch/arm64/include/asm/spinlock.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index c85e96d174a5..fc9682bfe002 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,9 +26,28 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions. */ +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + unsigned int tmp; + arch_spinlock_t lockval; -#define arch_spin_unlock_wait(lock) \ - do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0) + asm volatile( +" sevl\n" +"1: wfe\n" +"2: ldaxr %w0, %2\n" +" eor %w1, %w0, %w0, ror #16\n" +" cbnz %w1, 1b\n" + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ +" stxr %w1, %w0, %2\n" +" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */ + /* LSE atomics */ +" nop\n" +" nop\n") + : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) + : + : "memory"); +} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) From b42c14a531b4b887ffca279ad12eb844a99408ae Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 4 Dec 2015 11:38:39 +0800 Subject: [PATCH 0328/3220] UPSTREAM: arm64: ftrace: stop using kstop_machine to enable/disable tracing For ftrace on arm64, kstop_machine which is hugely disruptive to a running system is not needed to convert nops to ftrace calls or back, because that to be modified instrucions, that NOP, B or BL, are all safe instructions which called "concurrent modification and execution of instructions", that can be executed by one thread of execution as they are being modified by another thread of execution without requiring explicit synchronization. 
Signed-off-by: Li Bin Reviewed-by: Steven Rostedt Signed-off-by: Will Deacon Bug: 30369029 Patchset: arm64-ftrace (cherry picked from commit 81a6a146e88eca5d6726569779778d61489d85aa) Signed-off-by: Jeff Vander Stoep Change-Id: I54e2c0d49bd68f9547bd9f0da8b7520e02bb0714 --- arch/arm64/kernel/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c851be795080..9669b331a23b 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -93,6 +93,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ftrace_modify_code(pc, old, new, true); } +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + int __init ftrace_dyn_arch_init(void) { return 0; From 2c21fce80b6fab7304d2081f2b1157033e30ad50 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 4 Dec 2015 11:38:40 +0800 Subject: [PATCH 0329/3220] UPSTREAM: arm64: ftrace: fix the comments for ftrace_modify_code There is no need to worry about module or __init text disappearing: ftrace has a module notifier that is called when a module is being unloaded, before its text goes away. That notifier grabs the ftrace_lock mutex and removes the module's functions from the ftrace list, so ftrace will no longer make any modifications to that module's text; the updates that make functions traced or not are done under the ftrace_lock mutex as well. And by now, __init section code is not modified by ftrace at all, because it is blacklisted in recordmcount.c and ignored by ftrace. Suggested-by: Steven Rostedt Signed-off-by: Li Bin Signed-off-by: Will Deacon Bug: 30369029 Patchset: arm64-ftrace (cherry picked from commit 004ab584e028093996cf5b8e220b8bc50c5111cf) Signed-off-by: Jeff Vander Stoep Change-Id: I69df7eddbf9e17031920b950312399dc4d36c09e --- arch/arm64/kernel/ftrace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 9669b331a23b..8f7005bc35bd 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -29,12 +29,11 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, /* * Note: - * Due to modules and __init, code can disappear and change, - * we need to protect against faulting as well as code changing. - * We do this by aarch64_insn_*() which use the probe_kernel_*(). - * - * No lock is held here because all the modifications are run - * through stop_machine(). + * We are paranoid about modifying text, as if a bug were to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with aarch64_insn_*() which uses + * probe_kernel_*(), and make sure what we read is what we expected it + * to be before modifying it.
*/ if (validate) { if (aarch64_insn_read((void *)pc, &replaced)) From 611a16d0e5fe5f9f77c62551d94770e2cc825eb5 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 4 Dec 2015 12:42:29 +0000 Subject: [PATCH 0330/3220] UPSTREAM: arm64: Add trace_hardirqs_off annotation in ret_to_user When a kernel is built with CONFIG_TRACE_IRQFLAGS the following warning is produced when entering userspace for the first time: WARNING: at /work/Linux/linux-2.6-aarch64/kernel/locking/lockdep.c:3519 Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc3+ #639 Hardware name: Juno (DT) task: ffffffc9768a0000 ti: ffffffc9768a8000 task.ti: ffffffc9768a8000 PC is at check_flags.part.22+0x19c/0x1a8 LR is at check_flags.part.22+0x19c/0x1a8 pc : [] lr : [] pstate: 600001c5 sp : ffffffc9768abe10 x29: ffffffc9768abe10 x28: ffffffc9768a8000 x27: 0000000000000000 x26: 0000000000000001 x25: 00000000000000a6 x24: ffffffc00064be6c x23: ffffffc0009f249e x22: ffffffc9768a0000 x21: ffffffc97fea5480 x20: 00000000000001c0 x19: ffffffc00169a000 x18: 0000005558cc7b58 x17: 0000007fb78e3180 x16: 0000005558d2e238 x15: ffffffffffffffff x14: 0ffffffffffffffd x13: 0000000000000008 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: fefefefefefeff63 x9 : 7f7f7f7f7f7f7f7f x8 : 6e655f7371726964 x7 : 0000000000000001 x6 : ffffffc0001079c4 x5 : 0000000000000000 x4 : 0000000000000001 x3 : ffffffc001698438 x2 : 0000000000000000 x1 : ffffffc9768a0000 x0 : 000000000000002e Call trace: [] check_flags.part.22+0x19c/0x1a8 [] lock_is_held+0x80/0x98 [] __schedule+0x404/0x730 [] schedule+0x44/0xb8 [] ret_to_user+0x0/0x24 possible reason: unannotated irqs-off. irq event stamp: 502169 hardirqs last enabled at (502169): [] el0_irq_naked+0x1c/0x24 hardirqs last disabled at (502167): [] __do_softirq+0x17c/0x298 softirqs last enabled at (502168): [] __do_softirq+0x1fc/0x298 softirqs last disabled at (502143): [] irq_exit+0xa0/0xf0 This happens because we disable interrupts in ret_to_user before calling schedule() in work_resched. This patch adds the necessary trace_hardirqs_off annotation. Signed-off-by: Catalin Marinas Reported-by: Mark Rutland Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit db3899a6477a4dccd26cbfb7f408b6be2cc068e0) Signed-off-by: Jeff Vander Stoep Change-Id: I868c6c64c548f12156467959e2b8df09caae282f --- arch/arm64/kernel/entry.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ed3d75f6304..e5b25389c48f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -634,6 +634,9 @@ work_pending: bl do_notify_resume b ret_to_user work_resched: +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off // the IRQs are off here, inform the tracing code +#endif bl schedule /* From b7547d55d7a578f62053c86f475eabd68282e6aa Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Fri, 4 Dec 2015 11:02:25 +0000 Subject: [PATCH 0331/3220] UPSTREAM: arm64: Store struct thread_info in sp_el0 There is need for figuring out how to manage struct thread_info data when IRQ stack is introduced. struct thread_info information should be copied to IRQ stack under the current thread_info calculation logic whenever context switching is invoked. This is too expensive to keep supporting the approach. Instead, this patch pays attention to sp_el0 which is an unused scratch register in EL1 context. 
Using sp_el0 not only simplifies that management, it also avoids the large text-section growth a statically allocated IRQ stack would otherwise cause, since the masking operations using THREAD_SIZE can be removed in many places. Reviewed-by: Catalin Marinas Signed-off-by: Jungseok Lee Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 6cdf9c7ca687e01840d0215437620a20263012fc) Signed-off-by: Jeff Vander Stoep Change-Id: I53c9f44a0772b8649f302a65a7a6519d8eebcb91 --- arch/arm64/include/asm/thread_info.h | 10 ++++++++-- arch/arm64/kernel/entry.S | 15 ++++++++++++--- arch/arm64/kernel/head.S | 5 +++++ arch/arm64/kernel/sleep.S | 3 +++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 90c7ff233735..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -73,10 +73,16 @@ register unsigned long current_stack_pointer asm ("sp"); */ static inline struct thread_info *current_thread_info(void) __attribute_const__; +/* + * struct thread_info can be accessed directly via sp_el0. + */ static inline struct thread_info *current_thread_info(void) { - return (struct thread_info *) - (current_stack_pointer & ~(THREAD_SIZE - 1)); + unsigned long sp_el0; + + asm ("mrs %0, sp_el0" : "=r" (sp_el0)); + + return (struct thread_info *)sp_el0; } #define thread_saved_pc(tsk) \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e5b25389c48f..245fa6837880 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -88,7 +88,8 @@ .if \el == 0 mrs x21, sp_el0 - get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, + mov tsk, sp + and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. .else @@ -107,6 +108,13 @@ str x21, [sp, #S_SYSCALLNO] .endif + /* + * Set sp_el0 to current thread_info.
+ */ + .if \el == 0 + msr sp_el0, tsk + .endif + /* * Registers that may be useful after this macro is invoked: * @@ -164,8 +172,7 @@ alternative_endif .endm .macro get_thread_info, rd - mov \rd, sp - and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack + mrs \rd, sp_el0 .endm /* @@ -599,6 +606,8 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 + and x9, x9, #~(THREAD_SIZE - 1) + msr sp_el0, x9 ret ENDPROC(cpu_switch_to) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 23cfc08fc8ba..b363f340f2c7 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -424,6 +424,9 @@ __mmap_switched: b 1b 2: adr_l sp, initial_sp, x4 + mov x4, sp + and x4, x4, #~(THREAD_SIZE - 1) + msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer str_l x24, memstart_addr, x6 // Save PHYS_OFFSET mov x29, #0 @@ -606,6 +609,8 @@ ENDPROC(secondary_startup) ENTRY(__secondary_switched) ldr x0, [x21] // get secondary_data.stack mov sp, x0 + and x0, x0, #~(THREAD_SIZE - 1) + msr sp_el0, x0 // save thread_info mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index f586f7c875e2..e33fe33876ab 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -173,6 +173,9 @@ ENTRY(cpu_resume) /* load physical address of identity map page table in x1 */ adrp x1, idmap_pg_dir mov sp, x2 + /* save thread_info */ + and x2, x2, #~(THREAD_SIZE - 1) + msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context physical address * pointer and x1 to contain physical address of 1:1 page tables From 52494a8bb8a274cb31bb241bb92bd2617855f9f8 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 4 Dec 2015 11:02:26 +0000 Subject: [PATCH 0332/3220] UPSTREAM: arm64: Modify stack trace and dump for use with irq_stack This patch allows unwind_frame() to traverse from interrupt stack to task stack correctly. It requires data from a dummy stack frame, created during irq_stack_entry(), added by a later patch. A similar approach is taken to modify dump_backtrace(), which expects to find struct pt_regs underneath any call to functions marked __exception. When on an irq_stack, the struct pt_regs is stored on the old task stack, the location of which is stored in the dummy stack frame. 
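[Editorial aside, not part of the patch: for readers new to AArch64 stack walking, a frame record is simply two words at *fp, the caller's fp and the return address. The toy program below models the walk that unwind_frame() performs, including the bounds-and-alignment check; the IRQ-to-task hand-off described above amounts to reloading the saved task stack pointer from the dummy frame when the walk reaches the top of the IRQ stack. Names, constants and the fake stack are illustrative, not the kernel's.]

    #include <stdint.h>
    #include <stdio.h>

    struct stackframe { uint64_t fp, sp, pc; };

    /* One unwind step: fail if fp is off-stack or unaligned, else load
     * the previous {fp, pc} pair from the frame record at *fp. */
    static int unwind_frame(struct stackframe *frame, uint64_t low, uint64_t high)
    {
        uint64_t fp = frame->fp;

        if (fp < low || fp > high - 0x18 || (fp & 0xf))
            return -1;

        frame->sp = fp + 0x10;
        frame->fp = ((uint64_t *)fp)[0];
        frame->pc = ((uint64_t *)fp)[1];
        return 0;
    }

    int main(void)
    {
        _Alignas(16) uint64_t stack[8] = { 0 };   /* fake task stack */

        stack[4] = (uint64_t)&stack[0];  /* frame B links to frame A */
        stack[5] = 0x1111;               /* B's return address */
        stack[0] = 0;                    /* frame A: end of the chain */
        stack[1] = 0x2222;

        struct stackframe f = { .fp = (uint64_t)&stack[4] };
        while (unwind_frame(&f, (uint64_t)stack, (uint64_t)(stack + 8)) == 0)
            printf("pc: %#llx\n", (unsigned long long)f.pc);
        return 0;
    }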
Reviewed-by: Catalin Marinas Signed-off-by: AKASHI Takahiro [james.morse: merged two patches, reworked for per_cpu irq_stacks, and no alignment guarantees, added irq_stack definitions] Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 132cd887b5c54758d04bf25c52fa48f45e843a30) Signed-off-by: Jeff Vander Stoep Change-Id: I60b29291620a71ab7b6564730299d29f41ceb199 --- arch/arm64/include/asm/irq.h | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/irq.c | 3 +++ arch/arm64/kernel/stacktrace.c | 29 +++++++++++++++++++++++++++-- arch/arm64/kernel/traps.c | 14 +++++++++++++- 4 files changed, 75 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 8e8d30684392..e2f3f135a3bc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,10 +1,32 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H +#define IRQ_STACK_SIZE THREAD_SIZE +#define IRQ_STACK_START_SP THREAD_START_SP + +#ifndef __ASSEMBLER__ + +#include + #include +#include struct pt_regs; +DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +/* + * The highest address on the stack, and the first to be used. Used to + * find the dummy-stack frame put down by el?_irq() in entry.S. + */ +#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) + +/* + * The offset from irq_stack_ptr where entry.S will store the original + * stack pointer. Used by unwind_frame() and dump_backtrace(). + */ +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); + extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -12,4 +34,14 @@ static inline int nr_legacy_irqs(void) return 0; } +static inline bool on_irq_stack(unsigned long sp, int cpu) +{ + /* variable names the same as kernel/stacktrace.c */ + unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); + unsigned long high = low + IRQ_STACK_START_SP; + + return (low <= sp && sp <= high); +} + +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 9f17ec071ee0..1e3cef578e21 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -30,6 +30,9 @@ unsigned long irq_err_count; +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ccb6078ed9f2..b947eeffa5b2 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,7 @@ #include #include +#include #include /* @@ -39,17 +40,41 @@ int notrace unwind_frame(struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; + unsigned long irq_stack_ptr; + + /* + * Use raw_smp_processor_id() to avoid false-positives from + * CONFIG_DEBUG_PREEMPT. get_wchan() calls unwind_frame() on sleeping + * task stacks, we can be pre-empted in this case, so + * {raw_,}smp_processor_id() may give us the wrong value. Sleeping + * tasks can't ever be on an interrupt stack, so regardless of cpu, + * the checks will always fail. 
+ */ + irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); low = frame->sp; - high = ALIGN(low, THREAD_SIZE); + /* irq stacks are not THREAD_SIZE aligned */ + if (on_irq_stack(frame->sp, raw_smp_processor_id())) + high = irq_stack_ptr; + else + high = ALIGN(low, THREAD_SIZE) - 0x20; - if (fp < low || fp > high - 0x18 || fp & 0xf) + if (fp < low || fp > high || fp & 0xf) return -EINVAL; frame->sp = fp + 0x10; frame->fp = *(unsigned long *)(fp); frame->pc = *(unsigned long *)(fp + 8); + /* + * Check whether we are going to walk through from interrupt stack + * to task stack. + * If we reach the end of the stack - and its an interrupt stack, + * read the original task stack pointer from the dummy frame. + */ + if (frame->sp == irq_stack_ptr) + frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + return 0; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index e9b9b5364393..8a0084541f84 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,6 +146,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; + unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -180,9 +181,20 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (ret < 0) break; stack = frame.sp; - if (in_exception_text(where)) + if (in_exception_text(where)) { + /* + * If we switched to the irq_stack before calling this + * exception handler, then the pt_regs will be on the + * task stack. The easiest way to tell is if the large + * pt_regs would overlap with the end of the irq_stack. + */ + if (stack < irq_stack_ptr && + (stack + sizeof(struct pt_regs)) > irq_stack_ptr) + stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + dump_mem("", "Exception stack", stack, stack + sizeof(struct pt_regs), false); + } } } From d8c228e9513c852cd1c6388a8e011eec9120435c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 4 Dec 2015 11:02:27 +0000 Subject: [PATCH 0333/3220] UPSTREAM: arm64: Add do_softirq_own_stack() and enable irq_stacks entry.S is modified to switch to the per_cpu irq_stack during el{0,1}_irq. irq_count is used to detect recursive interrupts on the irq_stack, it is updated late by do_softirq_own_stack(), when called on the irq_stack, before __do_softirq() re-enables interrupts to process softirqs. do_softirq_own_stack() is added by this patch, but does not yet switch stack. This patch adds the dummy stack frame and data needed by the previous stack tracing patches. 
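[Editorial aside, not part of the patch: the irq_count scheme is small enough to model in full. In the sketch below, word 0 of the IRQ stack doubles as irq_count, exactly as described: el1_irq() only switches stacks when it is zero, and do_softirq_own_stack() bumps it before __do_softirq() re-enables interrupts. This is an illustrative single-cpu userspace model, not the kernel code.]

    #include <stdio.h>

    #define IRQ_STACK_WORDS 1024
    static unsigned long irq_stack[IRQ_STACK_WORDS]; /* [0] is irq_count */
    static int on_irq_stack;                         /* models the sp test */

    static void __do_softirq(void) { puts("softirqs run with IRQs enabled"); }

    static void do_softirq_own_stack(void)
    {
        if (on_irq_stack) {
            irq_stack[0]++;          /* a nested el1_irq must not switch */
            __do_softirq();
            irq_stack[0]--;
        } else {
            __do_softirq();          /* already on the task stack */
        }
    }

    static void el1_irq(void)
    {
        if (irq_stack[0]) {          /* recursive use? stay on this stack */
            puts("nested irq: no stack switch");
            return;
        }
        on_irq_stack = 1;            /* "switch" to the irq stack */
        do_softirq_own_stack();      /* reached via irq_exit() in reality */
        on_irq_stack = 0;            /* "switch" back */
    }

    int main(void) { el1_irq(); return 0; }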
Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 8e23dacd12a48e58125b84c817da50850b73280a) Signed-off-by: Jeff Vander Stoep Change-Id: I9f79437af3da0398cb12e7afd1aa9f473f59b2e6 --- arch/arm64/include/asm/irq.h | 2 ++ arch/arm64/kernel/entry.S | 42 ++++++++++++++++++++++++++++++++++-- arch/arm64/kernel/irq.c | 38 +++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index e2f3f135a3bc..fa2a8d0e4792 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,6 +11,8 @@ #include #include +#define __ARCH_HAS_DO_SOFTIRQ + struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 245fa6837880..8f7e737949fe 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -175,6 +176,42 @@ alternative_endif mrs \rd, sp_el0 .endm + .macro irq_stack_entry, dummy_lr + mov x19, sp // preserve the original sp + + adr_l x25, irq_stack + mrs x26, tpidr_el1 + add x25, x25, x26 + + /* + * Check the lowest address on irq_stack for the irq_count value, + * incremented by do_softirq_own_stack if we have re-enabled irqs + * while on the irq_stack. + */ + ldr x26, [x25] + cbnz x26, 9998f // recursive use? + + /* switch to the irq stack */ + mov x26, #IRQ_STACK_START_SP + add x26, x25, x26 + mov sp, x26 + + /* Add a dummy stack frame */ + stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame + mov x29, sp + stp xzr, x19, [sp, #-16]! + +9998: + .endm + + /* + * x19 should be preserved between irq_stack_entry and + * irq_stack_exit. + */ + .macro irq_stack_exit + mov sp, x19 + .endm + /* * These are the registers used in the syscall handler, and allow us to * have in theory up to 7 arguments to a function - x0 to x6. @@ -190,10 +227,11 @@ tsk .req x28 // current thread_info * Interrupt handling. */ .macro irq_handler - adrp x1, handle_arch_irq - ldr x1, [x1, #:lo12:handle_arch_irq] + ldr_l x1, handle_arch_irq mov x0, sp + irq_stack_entry x22 blr x1 + irq_stack_exit .endm .text diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 1e3cef578e21..ff7ebb710e51 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,14 +25,24 @@ #include #include #include +#include #include #include unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +/* + * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. + * irq_stack[0] is used as irq_count, a non-zero value indicates the stack + * is in use, and el?_irq() shouldn't switch to it. This is used to detect + * recursive use of the irq_stack, it is lazily updated by + * do_softirq_own_stack(), which is called on the irq_stack, before + * re-enabling interrupts to process softirqs. + */ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -56,3 +66,29 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } + +/* + * do_softirq_own_stack() is called from irq_exit() before __do_softirq() + * re-enables interrupts, at which point we may re-enter el?_irq(). 
We + * increase irq_count here so that el1_irq() knows that it is already on the + * irq stack. + * + * Called with interrupts disabled, so we don't worry about moving cpu, or + * being interrupted while modifying irq_count. + * + * This function doesn't actually switch stack. + */ +void do_softirq_own_stack(void) +{ + int cpu = smp_processor_id(); + + WARN_ON_ONCE(!irqs_disabled()); + + if (on_irq_stack(current_stack_pointer, cpu)) { + IRQ_COUNT()++; + __do_softirq(); + IRQ_COUNT()--; + } else { + __do_softirq(); + } +} From b485dccd9eee43b0739198f53459c6a0088e93a5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Dec 2015 13:58:42 +0000 Subject: [PATCH 0334/3220] UPSTREAM: arm64: irq: fix walking from irq stack to task stack Running with CONFIG_DEBUG_SPINLOCK=y can trigger a BUG with the new IRQ stack code: BUG: spinlock lockup suspected on CPU#1 This is due to the IRQ_STACK_TO_TASK_STACK macro incorrectly retrieving the task stack pointer stashed at the top of the IRQ stack. Sayeth James: | Yup, this is what is happening. Its an off-by-one due to broken | thinking about how the stack works. My broken thinking was: | | > top ------------ | > | dummy_lr | <- irq_stack_ptr | > ------------ | > | x29 | | > ------------ | > | x19 | <- irq_stack_ptr - 0x10 | > ------------ | > | xzr | | > ------------ | | But the stack-pointer is decreased before use. So it actually looks | like this: | | > ------------ | > | | <- irq_stack_ptr | > top ------------ | > | dummy_lr | | > ------------ | > | x29 | <- irq_stack_ptr - 0x10 | > ------------ | > | x19 | | > ------------ | > | xzr | <- irq_stack_ptr - 0x20 | > ------------ | | The value being used as the original stack is x29, which in all the | tests is sp but without the current frames data, hence there are no | missing frames in the output. | | Jungseok Lee picked it up with a 32bit user space because aarch32 | can't use x29, so it remains 0 forever. The fix he posted is correct. This patch fixes the macro and adds some of this wisdom to a comment, so that the layout of the IRQ stack is well understood. Cc: James Morse Reported-by: Jungseok Lee Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 7596abf2e5661d52c4f414f37addeed54e098880) Signed-off-by: Jeff Vander Stoep Change-Id: Ic65c0d0d90a30a5caf8b3791d1e856400bd2b5f5 --- arch/arm64/include/asm/irq.h | 20 ++++++++++++++++++-- arch/arm64/kernel/entry.S | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index fa2a8d0e4792..877c7e358384 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -19,7 +19,23 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); /* * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S. + * find the dummy-stack frame put down by el?_irq() in entry.S, which + * is structured as follows: + * + * ------------ + * | | <- irq_stack_ptr + * top ------------ + * | elr_el1 | + * ------------ + * | x29 | <- irq_stack_ptr - 0x10 + * ------------ + * | xzr | + * ------------ + * | x19 | <- irq_stack_ptr - 0x20 + * ------------ + * + * where x19 holds a copy of the task stack pointer. 
+ * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -27,7 +43,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). */ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8f7e737949fe..be7ec544b540 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -199,7 +199,7 @@ alternative_endif /* Add a dummy stack frame */ stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame mov x29, sp - stp xzr, x19, [sp, #-16]! + stp x19, xzr, [sp, #-16]! 9998: .endm From 722e6114950d4dfdf890c045461932c91a4209d7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:39 +0000 Subject: [PATCH 0335/3220] UPSTREAM: arm64: Add this_cpu_ptr() assembler macro for use in entry.S irq_stack is a per_cpu variable that needs to be accessed from entry.S. Use an assembler macro instead of the unreadable details. Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit aa4d5d3cbc258c355151a3903211b27359390ec5) Signed-off-by: Jeff Vander Stoep Change-Id: I2ccc11f442c0303c62e1c3e0a05a088958c922b8 --- arch/arm64/include/asm/assembler.h | 11 +++++++++++ arch/arm64/kernel/entry.S | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 12eff928ef8b..bb7b72734c24 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -193,6 +193,17 @@ lr .req x30 // link register str \src, [\tmp, :lo12:\sym] .endm + /* + * @sym: The name of the per-cpu variable + * @reg: Result of per_cpu(sym, smp_processor_id()) + * @tmp: scratch register + */ + .macro this_cpu_ptr, sym, reg, tmp + adr_l \reg, \sym + mrs \tmp, tpidr_el1 + add \reg, \reg, \tmp + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index be7ec544b540..e394f8c9595a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -179,9 +179,7 @@ alternative_endif .macro irq_stack_entry, dummy_lr mov x19, sp // preserve the original sp - adr_l x25, irq_stack - mrs x26, tpidr_el1 - add x25, x25, x26 + this_cpu_ptr irq_stack, x25, x26 /* * Check the lowest address on irq_stack for the irq_count value, From 4ba051d5a5476288cb27912c90de10aaf7c7f151 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:40 +0000 Subject: [PATCH 0336/3220] UPSTREAM: arm64: when walking onto the task stack, check sp & fp are in current->stack When unwind_frame() reaches the bottom of the irq_stack, the last fp points to the original task stack. unwind_frame() uses IRQ_STACK_TO_TASK_STACK() to find the sp value. If either value is wrong, we may end up walking a corrupt stack. Check these values are sane by testing if they are both on the stack pointed to by current->stack.
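[Editorial aside, not part of the patch: object_is_on_stack(), which the hunk below relies on, boils down to a range test against the task's stack allocation; the in-kernel helper tests against task_stack_page(current). A simplified model, assuming the 16K arm64 THREAD_SIZE that applied at the time:]

    #include <stdbool.h>

    #define THREAD_SIZE 16384  /* assumption: arm64 default of the era */

    /* true if p lies within the THREAD_SIZE region at stack_base */
    static bool object_is_on_stack_model(const void *stack_base, const void *p)
    {
        return p >= stack_base &&
               (const char *)p < (const char *)stack_base + THREAD_SIZE;
    }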
Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 1ffe199b1c9b72a8e752a9ae2a7af10128ab2ca1) Signed-off-by: Jeff Vander Stoep Change-Id: I2e5bf1ce899a1018f1c5b8ccb4f7c816d61bba21 --- arch/arm64/kernel/stacktrace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b947eeffa5b2..d916d5b6aef6 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -71,9 +71,17 @@ int notrace unwind_frame(struct stackframe *frame) * to task stack. * If we reach the end of the stack - and its an interrupt stack, * read the original task stack pointer from the dummy frame. + * + * Check the frame->fp we read from the bottom of the irq_stack, + * and the original task stack pointer are both in current->stack. */ - if (frame->sp == irq_stack_ptr) - frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (frame->sp == irq_stack_ptr) { + unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + + if(object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) + frame->sp = orig_sp; + } return 0; } From 6674ba575b07febb6f025811e6afb64c15834b30 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:41 +0000 Subject: [PATCH 0337/3220] UPSTREAM: arm64: don't call C code with el0's fp register On entry from el0, we save all the registers on the kernel stack, and restore them before returning. x29 remains unchanged when we call out to C code, which will store x29 as the frame-pointer on the stack. Instead, write 0 into x29 after entry from el0, to avoid any risk of tracing into user space. Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: per-cpu-irq-stack (cherry picked from commit 49003a8d6b35e128ef5e51433e60e783a46fbe5f) Signed-off-by: Jeff Vander Stoep Change-Id: Ifae7003018e4088d5de038cef25fa210211a75b6 --- arch/arm64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e394f8c9595a..2284c296e3f7 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -93,6 +93,8 @@ and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. + + mov x29, xzr // fp pointed to user-space .else add x21, sp, #S_FRAME_SIZE .endif From eb87372aa93bc9849fe05a86eb0994e2bc755c45 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 23 Jun 2016 18:42:51 -0700 Subject: [PATCH 0338/3220] net: diag: Add support to filter on device index Add support to inet_diag facility to filter sockets based on device index. If an interface index is in the filter only sockets bound to that index (sk_bound_dev_if) are returned. [cherry-pick of net-next 637c841dd7a5f9bd97b75cbe90b526fa1a52e530] Change-Id: I6b6bcdcf15d3142003f1ee53b4d82f2fabbb8250 Signed-off-by: David Ahern Signed-off-by: David S. 
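[Editorial aside, not part of the patch: tying this to the toy walker sketched earlier, a zero frame pointer is below any plausible stack bound, so the very first unwind step fails and the backtrace ends cleanly instead of dereferencing whatever a user-controlled x29 pointed at. An illustrative fragment, reusing struct stackframe and unwind_frame() from that sketch with made-up bounds:]

    /* After the equivalent of "mov x29, xzr" on entry from EL0: */
    struct stackframe f = { .fp = 0 };
    int stopped = (unwind_frame(&f, 0x1000, 0x2000) != 0);  /* always true */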
Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 68a1f71fde9f..b49bfb6ea22a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -72,6 +72,7 @@ enum { INET_DIAG_BC_AUTO, INET_DIAG_BC_S_COND, INET_DIAG_BC_D_COND, + INET_DIAG_BC_DEV_COND, /* u32 ifindex */ }; struct inet_diag_hostcond { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a403a676d452..e532671c3f4f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -44,6 +44,7 @@ struct inet_diag_entry { u16 dport; u16 family; u16 userlocks; + u32 ifindex; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -552,6 +553,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_DEV_COND: { + u32 ifindex; + + ifindex = *((const u32 *)(op + 1)); + if (ifindex != entry->ifindex) + yes = 0; + break; + } } if (yes) { @@ -594,6 +603,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); + entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); @@ -617,6 +627,17 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } +/* data is u32 ifindex */ +static bool valid_devcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + /* Check ifindex space. */ + *min_len += sizeof(u32); + if (len < *min_len) + return false; + + return true; +} /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) @@ -681,6 +702,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_DEV_COND: + if (!valid_devcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: From 5bd0c83ba48ce73eb89bd4308b07a263a07dd21e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 24 Aug 2016 15:46:25 +0900 Subject: [PATCH 0339/3220] net: diag: slightly refactor the inet_diag_bc_audit error checks. This simplifies the code a bit and also allows inet_diag_bc_audit to send to userspace an error that isn't EINVAL. [cherry-pick of net-next 627cc4add53c0470bfd118002669205d222d3a54] Change-Id: Iee3d2bbb19f3110d71f0698ffb293f9bdffc8ef1 Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/inet_diag.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e532671c3f4f..75f6d4bde273 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -687,10 +687,16 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op, return true; } -static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) +static int inet_diag_bc_audit(const struct nlattr *attr) { - const void *bc = bytecode; - int len = bytecode_len; + const void *bytecode, *bc; + int bytecode_len, len; + + if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) + return -EINVAL; + + bytecode = bc = nla_data(attr); + len = bytecode_len = nla_len(attr); while (len > 0) { int min_len = sizeof(struct inet_diag_bc_op); @@ -1001,13 +1007,13 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(nlh, hdrlen)) { struct nlattr *attr; + int err; attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - if (!attr || - nla_len(attr) < sizeof(struct inet_diag_bc_op) || - inet_diag_bc_audit(nla_data(attr), nla_len(attr))) - return -EINVAL; + err = inet_diag_bc_audit(attr); + if (err) + return err; } { struct netlink_dump_control c = { @@ -1032,13 +1038,13 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) h->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(h, hdrlen)) { struct nlattr *attr; + int err; attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - if (!attr || - nla_len(attr) < sizeof(struct inet_diag_bc_op) || - inet_diag_bc_audit(nla_data(attr), nla_len(attr))) - return -EINVAL; + err = inet_diag_bc_audit(attr); + if (err) + return err; } { struct netlink_dump_control c = { From ecd02bbc43641454b940fcf9c16b39ab8179af67 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 24 Aug 2016 15:46:26 +0900 Subject: [PATCH 0340/3220] net: diag: allow socket bytecode filters to match socket marks This allows a privileged process to filter by socket mark when dumping sockets via INET_DIAG_BY_FAMILY. This is useful on systems that use mark-based routing such as Android. The ability to filter socket marks requires CAP_NET_ADMIN, which is consistent with other privileged operations allowed by the SOCK_DIAG interface such as the ability to destroy sockets and the ability to inspect BPF filters attached to packet sockets. [cherry-pick of a52e95abf772b43c9226e9a72d3c1353903ba96f] Change-Id: I8b90b814264d9808bda050cdba8f104943bdb9a8 Tested: https://android-review.googlesource.com/261350 Signed-off-by: Lorenzo Colitti Acked-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 6 ++++++ net/ipv4/inet_diag.c | 36 +++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b49bfb6ea22a..35435c348c60 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -73,6 +73,7 @@ enum { INET_DIAG_BC_S_COND, INET_DIAG_BC_D_COND, INET_DIAG_BC_DEV_COND, /* u32 ifindex */ + INET_DIAG_BC_MARK_COND, }; struct inet_diag_hostcond { @@ -82,6 +83,11 @@ struct inet_diag_hostcond { __be32 addr[0]; }; +struct inet_diag_markcond { + __u32 mark; + __u32 mask; +}; + /* Base info structure. It contains socket identity (addrs/ports/cookie) * and, alas, the information shown by netstat. 
*/ struct inet_diag_msg { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 75f6d4bde273..5202149cbce0 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -45,6 +45,7 @@ struct inet_diag_entry { u16 family; u16 userlocks; u32 ifindex; + u32 mark; }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -561,6 +562,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } + case INET_DIAG_BC_MARK_COND: { + struct inet_diag_markcond *cond; + + cond = (struct inet_diag_markcond *)(op + 1); + if ((entry->mark & cond->mask) != cond->mark) + yes = 0; + break; + } } if (yes) { @@ -605,6 +614,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.dport = ntohs(inet->inet_dport); entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; + if (sk_fullsock(sk)) + entry.mark = sk->sk_mark; + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else + entry.mark = 0; return inet_diag_bc_run(bc, &entry); } @@ -687,8 +702,17 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op, return true; } -static int inet_diag_bc_audit(const struct nlattr *attr) +static bool valid_markcond(const struct inet_diag_bc_op *op, int len, + int *min_len) { + *min_len += sizeof(struct inet_diag_markcond); + return len >= *min_len; +} + +static int inet_diag_bc_audit(const struct nlattr *attr, + const struct sk_buff *skb) +{ + bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); const void *bytecode, *bc; int bytecode_len, len; @@ -719,6 +743,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr) if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_MARK_COND: + if (!net_admin) + return -EPERM; + if (!valid_markcond(bc, len, &min_len)) + return -EINVAL; + break; case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: @@ -1011,7 +1041,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr); + err = inet_diag_bc_audit(attr, skb); if (err) return err; } @@ -1042,7 +1072,7 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr); + err = inet_diag_bc_audit(attr, skb); if (err) return err; } From 2f500d61a4df5764d94abfab7cbc43e4e9b92881 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Aug 2016 21:06:33 -0700 Subject: [PATCH 0341/3220] net: diag: support SOCK_DESTROY for UDP sockets This implements SOCK_DESTROY for UDP sockets similar to what was done for TCP with commit c1e64e298b8ca ("net: diag: Support destroying TCP sockets.") A process with a UDP socket targeted for destroy is awakened and recvmsg fails with ECONNABORTED. [cherry-pick of 5d77dca82839ef016a93ad7acd7058b14d967752] Change-Id: I4b4862548e6e3c05dde27781e7daa0b18b93bd81 Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/udp.h | 1 + net/ipv4/udp.c | 15 +++++++++ net/ipv4/udp_diag.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/udp.c | 1 + 4 files changed, 96 insertions(+) diff --git a/include/net/udp.h b/include/net/udp.h index 6d4ed18e1427..e57f50258cda 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -238,6 +238,7 @@ int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); void udp_err(struct sk_buff *, u32); +int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d8946929813e..660933edd2d2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2257,6 +2257,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) } EXPORT_SYMBOL(udp_poll); +int udp_abort(struct sock *sk, int err) +{ + lock_sock(sk); + + sk->sk_err = err; + sk->sk_error_report(sk); + udp_disconnect(sk, 0); + + release_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(udp_abort); + struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, @@ -2288,6 +2302,7 @@ struct proto udp_prot = { .compat_getsockopt = compat_udp_getsockopt, #endif .clear_sk = sk_prot_clear_portaddr_nulls, + .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 6116604bf6e8..8b9570c84fd5 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -165,12 +165,88 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_wqueue = sk_wmem_alloc_get(sk); } +#ifdef CONFIG_INET_DIAG_DESTROY +static int __udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req, + struct udp_table *tbl) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + int err; + + rcu_read_lock(); + + if (req->sdiag_family == AF_INET) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl); +#if IS_ENABLED(CONFIG_IPV6) + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl); + + else + sk = __udp6_lib_lookup(net, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if, tbl); + } +#endif + else { + rcu_read_unlock(); + return -EINVAL; + } + + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + + rcu_read_unlock(); + + if (!sk) + return -ENOENT; + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_put(sk); + return -ENOENT; + } + + err = sock_diag_destroy(sk, ECONNABORTED); + + sock_put(sk); + + return err; +} + +static int udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udp_table); +} + +static int udplite_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udplite_table); +} + +#endif + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, 
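/*
 * [Editorial aside, not part of the patch: from userspace, the new
 * destroy path is driven over a NETLINK_SOCK_DIAG socket using
 * nlmsg_type SOCK_DESTROY with an inet_diag_req_v2 payload that
 * identifies the target socket. A hedged sketch, error handling and
 * address fields omitted:
 *
 *   struct { struct nlmsghdr nlh; struct inet_diag_req_v2 req; } m = {0};
 *   m.nlh.nlmsg_len      = sizeof(m);
 *   m.nlh.nlmsg_type     = SOCK_DESTROY;
 *   m.nlh.nlmsg_flags    = NLM_F_REQUEST | NLM_F_ACK;
 *   m.req.sdiag_family   = AF_INET;
 *   m.req.sdiag_protocol = IPPROTO_UDP;
 *   // fill m.req.id with the socket's addresses and ports
 *   send(nl_fd, &m, sizeof(m), 0);
 *
 * The victim's blocked recvmsg() then fails with ECONNABORTED, as the
 * commit message above describes.]
 */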
.idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udp_diag_destroy, +#endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -192,6 +268,9 @@ static const struct inet_diag_handler udplite_diag_handler = { .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udplite_diag_destroy, +#endif }; static int __init udp_diag_init(void) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a1b6adc20e1e..54235ef177bb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1552,6 +1552,7 @@ struct proto udpv6_prot = { .compat_getsockopt = compat_udpv6_getsockopt, #endif .clear_sk = udp_v6_clear_sk, + .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { From 513e28e1983a31c52f30b4ca1b1e69e4c25b71fe Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 7 Sep 2016 13:38:35 +0900 Subject: [PATCH 0342/3220] net: diag: make udp_diag_destroy work for mapped addresses. udp_diag_destroy does look up the IPv4 UDP hashtable for mapped addresses, but it gets the IPv4 address to look up from the beginning of the IPv6 address instead of the end. [cherry-pick of net-next f95bf346226b9b79352e05508beececc807cc37a] Change-Id: Ia369482c4645bcade320b2c33a763f1ce4378ff1 Tested: https://android-review.googlesource.com/269874 Fixes: 5d77dca82839 ("net: diag: support SOCK_DESTROY for UDP sockets") Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/udp_diag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 8b9570c84fd5..39e0b8347bd2 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -186,8 +186,8 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, - req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_dst[3], req->id.idiag_dport, + req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, tbl); else From 662afd95b354bcf78a301838f345c59d9e164ad4 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 8 Sep 2016 00:42:25 +0900 Subject: [PATCH 0343/3220] net: inet: diag: expose the socket mark to privileged processes. This adds the capability for a process that has CAP_NET_ADMIN on a socket to see the socket mark in socket dumps. Commit a52e95abf772 ("net: diag: allow socket bytecode filters to match socket marks") recently gave privileged processes the ability to filter socket dumps based on mark. This patch is complementary: it ensures that the mark is also passed to userspace in the socket's netlink attributes. It is useful for tools like ss which display information about sockets. [backport of net-next d545caca827b65aab557a9e9dcdcf1e5a3823c2d] Change-Id: I33336ed9c3ee3fb78fe05c4c47b7fd18c6e33ef1 Tested: https://android-review.googlesource.com/270210 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- include/linux/inet_diag.h | 2 +- include/uapi/linux/inet_diag.h | 6 ++++- net/ipv4/inet_diag.c | 44 +++++++++++++++++++++++----------- net/ipv4/udp_diag.c | 10 ++++---- 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 7c27fa1030e8..795852dc3434 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh); + const struct nlmsghdr *unlh, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 35435c348c60..c7f189bd5979 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -120,9 +120,13 @@ enum { INET_DIAG_DCTCPINFO, INET_DIAG_PROTOCOL, /* response attribute only */ INET_DIAG_SKV6ONLY, + INET_DIAG_LOCALS, + INET_DIAG_PEERS, + INET_DIAG_PAD, + INET_DIAG_MARK, }; -#define INET_DIAG_MAX INET_DIAG_SKV6ONLY +#define INET_DIAG_MAX INET_DIAG_MARK /* INET_DIAG_MEM */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5202149cbce0..fcb83b2a61f0 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -98,6 +98,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -110,7 +111,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { const struct inet_sock *inet = inet_sk(sk); const struct tcp_congestion_ops *ca_ops; @@ -160,6 +162,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #endif + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + goto errout; + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -258,10 +263,11 @@ static int inet_csk_diag_fill(struct sock *sk, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, - user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, + portid, seq, nlmsg_flags, unlh, net_admin); } static int inet_twsk_diag_fill(struct sock *sk, @@ -303,8 +309,9 @@ static int inet_twsk_diag_fill(struct sock *sk, static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { + struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -318,7 +325,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; 
- r->idiag_retrans = inet_reqsk(sk)->num_retrans; + r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); @@ -330,6 +337,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = 0; r->idiag_inode = 0; + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) + return -EMSGSIZE; + nlmsg_end(skb, nlh); return 0; } @@ -338,7 +349,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, portid, seq, @@ -346,10 +357,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh); + nlmsg_flags, unlh, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -416,7 +427,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -777,7 +789,8 @@ static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - const struct nlattr *bc) + const struct nlattr *bc, + bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -785,7 +798,8 @@ static int inet_csk_diag_dump(struct sock *sk, return inet_csk_diag_fill(sk, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, + net_admin); } static void twsk_build_assert(void) @@ -821,6 +835,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; u32 idiag_states = r->idiag_states; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; @@ -862,7 +877,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, cb->args[3] > 0) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, + bc, net_admin) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -930,7 +946,7 @@ skip_listen_ht: sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh); + cb->nlh, net_admin); if (res < 0) { spin_unlock_bh(lock); goto done; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 39e0b8347bd2..092aa60e8b92 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -20,7 +20,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return inet_sk_diag_fill(sk, NULL, skb, req, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 
NLM_F_MULTI, cb->nlh); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, @@ -75,7 +75,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, err = inet_sk_diag_fill(sk, NULL, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -98,6 +99,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, { int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } From c47e4a12f511721dab3d2407d18e0e1a6b7b0e1c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:36 +0000 Subject: [PATCH 0344/3220] UPSTREAM: arm64: mm: remove pointless PAGE_MASKing As pgd_offset{,_k} shift the input address by PGDIR_SHIFT, the sub-page bits will always be shifted out. There is no need to apply PAGE_MASK before this. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit e2c30ee320eb96304896c7ab84499e5bc5e5fb6e) Signed-off-by: Jeff Vander Stoep Change-Id: I86d3438aecf7295d5895e1430c1e19fbc82c9719 --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4c381f0e77a7..7b6e58a58bf3 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -288,7 +288,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), phys, virt, + __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, size, prot, early_alloc); } @@ -309,7 +309,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, return; } - return __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), + return __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, size, prot, late_alloc); } From 31400ef82e7ae74663e9ca8e041516ede5e617e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:37 +0000 Subject: [PATCH 0345/3220] BACKPORT: arm64: Remove redundant padding from linker script Currently we place an ALIGN_DEBUG_RO between text and data for the .text and .init sections, and depending on configuration each of these may result in up to SECTION_SIZE bytes worth of padding (for DEBUG_RODATA_ALIGN). We make no distinction between the text and data in each of these sections at any point when creating the initial page tables in head.S. We also make no distinction when modifying the tables; __map_memblock, fixup_executable, mark_rodata_ro, and fixup_init only work at section granularity. Thus this padding is unnecessary. For the split between init text and data we impose a minimum alignment of 16 bytes, but this is also unnecessary.
The init data is output immediately after the padding before any symbols are defined, so this is not required to keep a symbol for a linker section array correctly associated with the data. Any objects within the section will be given at least their usual alignment regardless. This patch removes the redundant padding. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 5b28cd9d084eca8ddc46270d2720305bfd40e348) Signed-off-by: Jeff Vander Stoep Change-Id: I5540ba1f4d90e2ae8fafa22e98c389bc4e975ac7 --- arch/arm64/kernel/vmlinux.lds.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f472809a7b45..1cc195872a1f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -113,7 +113,6 @@ SECTIONS *(.got) /* Global offset table */ } - ALIGN_DEBUG_RO _etext = .; /* End of text section */ RO_DATA(PAGE_SIZE) @@ -129,7 +128,6 @@ SECTIONS ARM_EXIT_KEEP(EXIT_TEXT) } - ALIGN_DEBUG_RO_MIN(16) .init.data : { INIT_DATA INIT_SETUP(16) From 8ace694d891ede66ac585fa12aa76cb9c5530427 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:38 +0000 Subject: [PATCH 0346/3220] UPSTREAM: arm64: mm: fold alternatives into .init Currently we treat the alternatives separately from other data that's only used during initialisation, using separate .altinstructions and .altinstr_replacement linker sections. These are freed for general allocation separately from .init*. This is problematic as: * We do not remove execute permissions, as we do for .init, leaving the memory executable. * We pad between them, making the kernel Image binary up to PAGE_SIZE bytes larger than necessary. This patch moves the two sections into the contiguous region used for .init*. This saves some memory, ensures that we remove execute permissions, and allows us to remove some code made redundant by this reorganisation.
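[Editorial aside, not part of the patch: for context on why no explicit free is needed once these sections live in .init, recall what an alternatives record is. Below is a simplified model of the one-shot boot-time walk; the struct layout is abridged from the arm64 header, the offsets are relative to the field that stores them, and cpus_have_cap() is stubbed out. After this loop runs once at boot, the records are dead data, so freeing them along with the rest of .init is sufficient.]

    #include <stdint.h>
    #include <string.h>

    struct alt_instr {
        int32_t  orig_offset;  /* offset to original instruction(s) */
        int32_t  alt_offset;   /* offset to replacement instruction(s) */
        uint16_t cpufeature;   /* feature bit that enables this entry */
        uint8_t  orig_len;
        uint8_t  alt_len;
    };

    static int cpus_have_cap(uint16_t feature) { (void)feature; return 1; }

    /* Walk every record once; patch in the replacement if the cpu has
     * the feature. (The kernel also fixes up relative branches.) */
    static void apply_alternatives_model(struct alt_instr *begin, struct alt_instr *end)
    {
        struct alt_instr *a;

        for (a = begin; a < end; a++) {
            uint8_t *orig = (uint8_t *)&a->orig_offset + a->orig_offset;
            uint8_t *alt  = (uint8_t *)&a->alt_offset  + a->alt_offset;

            if (!cpus_have_cap(a->cpufeature))
                continue;
            memcpy(orig, alt, a->alt_len);
        }
    }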
Signed-off-by: Mark Rutland Cc: Andre Przywara Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9aa4ec1571da62366cfddc20f3b923609604fe63) Signed-off-by: Jeff Vander Stoep Change-Id: I3fee118ead5c73ade50ea10d436881c9424a549c --- arch/arm64/include/asm/alternative.h | 1 - arch/arm64/kernel/alternative.c | 6 ------ arch/arm64/kernel/vmlinux.lds.S | 5 ++--- arch/arm64/mm/init.c | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index d56ec0715157..e4962f04201e 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -19,7 +19,6 @@ struct alt_instr { void __init apply_alternatives_all(void); void apply_alternatives(void *start, size_t length); -void free_alternatives_memory(void); #define ALTINSTR_ENTRY(feature) \ " .word 661b - .\n" /* label */ \ diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index ab9db0e9818c..d2ee1b21a10d 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -158,9 +158,3 @@ void apply_alternatives(void *start, size_t length) __apply_alternatives(®ion); } - -void free_alternatives_memory(void) -{ - free_reserved_area(__alt_instructions, __alt_instructions_end, - 0, "alternatives"); -} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1cc195872a1f..1a056bb26633 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -142,9 +142,6 @@ SECTIONS PERCPU_SECTION(L1_CACHE_BYTES) - . = ALIGN(PAGE_SIZE); - __init_end = .; - . = ALIGN(4); .altinstructions : { __alt_instructions = .; @@ -156,6 +153,8 @@ SECTIONS } . = ALIGN(PAGE_SIZE); + __init_end = .; + _data = .; _sdata = .; RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index d43715a14383..f13078bbf66e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -362,7 +362,6 @@ void free_initmem(void) { fixup_init(); free_initmem_default(0); - free_alternatives_memory(); } #ifdef CONFIG_BLK_DEV_INITRD From c5ca0f872092c8c7fbcc903be244b63a33afd585 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 10 Dec 2015 16:54:32 +0000 Subject: [PATCH 0347/3220] UPSTREAM: arm64: cmpxchg: Don't incldue linux/mmdebug.h The arm64 asm/cmpxchg.h includes linux/mmdebug.h but doesn't so far as I can tell actually use anything from it. Removing the inclusion reduces spurious header dependency rebuilds and also avoids issues with recursive inclusions of headers causing build breaks due to attempts to use things before they are defined if linux/mmdebug.h starts pulling in more low level headers. 
Such errors have happened in -next recently, for example: In file included from include/linux/completion.h:11:0, from include/linux/rcupdate.h:43, from include/linux/tracepoint.h:19, from include/linux/mmdebug.h:6, from ./arch/arm64/include/asm/cmpxchg.h:22, from ./arch/arm64/include/asm/atomic.h:41, from include/linux/atomic.h:4, from include/linux/spinlock.h:406, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:19, from arch/arm64/kernel/asm-offsets.c:21: include/linux/wait.h: In function 'wait_on_atomic_t': include/linux/wait.h:1218:2: error: implicit declaration of function 'atomic_read' [-Werror=implicit-function-declaration] if (atomic_read(val) == 0) Signed-off-by: Mark Brown Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 4a6ccf30263f4e265c0f171561bf4c40bed5f273) Signed-off-by: Jeff Vander Stoep Change-Id: Idf44176dad0abc11e67b4e416b02a3fba14f3f1b --- arch/arm64/include/asm/cmpxchg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index 9ea611ea69df..510c7b404454 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -19,7 +19,6 @@ #define __ASM_CMPXCHG_H #include -#include #include #include From 2393897c7c7e10c2e8fe7d1c32466cbb885c9047 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 11 Dec 2015 11:04:31 +0000 Subject: [PATCH 0348/3220] UPSTREAM: arm64: mm: place __cpu_setup in .text We drop __cpu_setup in .text.init, which ends up being part of .text. The .text.init section was a legacy section name which has been unused elsewhere for a long time. The ".text.init" name is misleading if read as a synonym for ".init.text". Any CPU may execute __cpu_setup before turning the MMU on, so it should simply live in .text. Remove the pointless section assignment. This will leave __cpu_setup in the .text section. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f00083cae331e5d3eecade6b4fdc35d0825e73ef) Signed-off-by: Jeff Vander Stoep Change-Id: I2e9b154cd6de92662c70c2b957479448252c661a --- arch/arm64/mm/proc.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index cacecc4ad3e5..b6f9053ab184 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -139,8 +139,6 @@ ENTRY(cpu_do_switch_mm) ret ENDPROC(cpu_do_switch_mm) - .section ".text.init", #alloc, #execinstr - /* * __cpu_setup * From 9e4d16282cb949c1ef3d9736de79c8155cb24d7b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 17 Nov 2015 14:45:47 +0000 Subject: [PATCH 0349/3220] UPSTREAM: arm64: Documentation: add list of software workarounds for errata It's not immediately obvious which hardware errata are worked around in the Linux kernel for an arbitrary kernel tree, so add a file to keep track of what we're working around. 
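For readers unfamiliar with how a registry entry like the ones in this file translates into kernel behaviour, the sketch below models the detection step in standalone C. It is a toy: the mask, part number and sample MIDR value are simplified stand-ins rather than the kernel's real MIDR helpers, which also match variant and revision ranges.

    #include <stdint.h>
    #include <stdio.h>

    #define MIDR_PART_MASK  0x0000fff0u  /* bits [15:4]: part number */
    #define PART_CORTEX_A57 0x0000d070u

    struct erratum { const char *id; uint32_t part; };

    static int cpu_affected(uint32_t midr, const struct erratum *e)
    {
        return (midr & MIDR_PART_MASK) == e->part;
    }

    int main(void)
    {
        const struct erratum e832075 = { "#832075", PART_CORTEX_A57 };
        uint32_t midr = 0x411fd070; /* hypothetical Cortex-A57 MIDR_EL1 value */

        if (cpu_affected(midr, &e832075))
            printf("patch in workaround for erratum %s\n", e832075.id);
        return 0;
    }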
Acked-by: Catalin Marinas Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9cb9c9e5ba8453537e8e645318edf231fe54eaf9) Signed-off-by: Jeff Vander Stoep Change-Id: I139972ff6e10e12c4b4f27cd047f55b3dd8b4118 --- Documentation/arm64/silicon-errata.txt | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 Documentation/arm64/silicon-errata.txt diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt new file mode 100644 index 000000000000..58b71ddf9b60 --- /dev/null +++ b/Documentation/arm64/silicon-errata.txt @@ -0,0 +1,58 @@ + Silicon Errata and Software Workarounds + ======================================= + +Author: Will Deacon +Date : 27 November 2015 + +It is an unfortunate fact of life that hardware is often produced with +so-called "errata", which can cause it to deviate from the architecture +under specific circumstances. For hardware produced by ARM, these +errata are broadly classified into the following categories: + + Category A: A critical error without a viable workaround. + Category B: A significant or critical error with an acceptable + workaround. + Category C: A minor error that is not expected to occur under normal + operation. + +For more information, consult one of the "Software Developers Errata +Notice" documents available on infocenter.arm.com (registration +required). + +As far as Linux is concerned, Category B errata may require some special +treatment in the operating system. For example, avoiding a particular +sequence of code, or configuring the processor in a particular way. A +less common situation may require similar actions in order to declassify +a Category A erratum into a Category C erratum. These are collectively +known as "software workarounds" and are only required in the minority of +cases (e.g. those cases that both require a non-secure workaround *and* +can be triggered by Linux). + +For software workarounds that may adversely impact systems unaffected by +the erratum in question, a Kconfig entry is added under "Kernel +Features" -> "ARM errata workarounds via the alternatives framework". +These are enabled by default and patched in at runtime when an affected +CPU is detected. For less-intrusive workarounds, a Kconfig option is not +available and the code is structured (preferably with a comment) in such +a way that the erratum will not be hit. + +This approach can make it slightly onerous to determine exactly which +errata are worked around in an arbitrary kernel source tree, so this +file acts as a registry of software workarounds in the Linux Kernel and +will be updated when new workarounds are committed and backported to +stable kernels. 
+ +| Implementor | Component | Erratum ID | Kconfig | ++----------------+-----------------+-----------------+-------------------------+ +| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | +| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | +| ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 | +| ARM | Cortex-A53 | #819472 | ARM64_ERRATUM_819472 | +| ARM | Cortex-A53 | #845719 | ARM64_ERRATUM_845719 | +| ARM | Cortex-A53 | #843419 | ARM64_ERRATUM_843419 | +| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | +| ARM | Cortex-A57 | #852523 | N/A | +| ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | +| | | | | +| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | +| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | From 41b7e19e9b27266024dc0565426a91d922c3d96b Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 15 Dec 2015 11:21:25 +0000 Subject: [PATCH 0350/3220] UPSTREAM: arm64: reduce stack use in irq_handler The code for switching to irq_stack stores three pieces of information on the stack, fp+lr, as a fake stack frame (that lets us walk back onto the interrupted tasks stack frame), and the address of the struct pt_regs that contains the register values from kernel entry. (which dump_backtrace() will print in any stack trace). To reduce this, we store fp, and the pointer to the struct pt_regs. unwind_frame() can recognise this as the irq_stack dummy frame, (as it only appears at the top of the irq_stack), and use the struct pt_regs values to find the missing interrupted link-register. Suggested-by: Will Deacon Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 971c67ce37cfeeaf560e792a2c3bc21d8b67163a) Signed-off-by: Jeff Vander Stoep Change-Id: I84cbb04857a441083d331e875c3e228d24ec2276 --- arch/arm64/include/asm/irq.h | 11 ++++------- arch/arm64/kernel/entry.S | 12 +++++++----- arch/arm64/kernel/stacktrace.c | 19 ++++++++++++++++--- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 877c7e358384..3bece4379bd9 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -25,16 +25,13 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * ------------ * | | <- irq_stack_ptr * top ------------ - * | elr_el1 | + * | x19 | <- irq_stack_ptr - 0x08 * ------------ * | x29 | <- irq_stack_ptr - 0x10 * ------------ - * | xzr | - * ------------ - * | x19 | <- irq_stack_ptr - 0x20 - * ------------ * - * where x19 holds a copy of the task stack pointer. + * where x19 holds a copy of the task stack pointer where the struct pt_regs + * from kernel_entry can be found. * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -43,7 +40,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). 
*/ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); +#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2284c296e3f7..0667fb7d8bb1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -178,7 +178,7 @@ alternative_endif mrs \rd, sp_el0 .endm - .macro irq_stack_entry, dummy_lr + .macro irq_stack_entry mov x19, sp // preserve the original sp this_cpu_ptr irq_stack, x25, x26 @@ -196,10 +196,12 @@ alternative_endif add x26, x25, x26 mov sp, x26 - /* Add a dummy stack frame */ - stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame + /* + * Add a dummy stack frame, this non-standard format is fixed up + * by unwind_frame() + */ + stp x29, x19, [sp, #-16]! mov x29, sp - stp x19, xzr, [sp, #-16]! 9998: .endm @@ -229,7 +231,7 @@ tsk .req x28 // current thread_info .macro irq_handler ldr_l x1, handle_arch_irq mov x0, sp - irq_stack_entry x22 + irq_stack_entry blr x1 irq_stack_exit .endm diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d916d5b6aef6..b9fd3a8abfc1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -70,17 +70,30 @@ int notrace unwind_frame(struct stackframe *frame) * Check whether we are going to walk through from interrupt stack * to task stack. * If we reach the end of the stack - and its an interrupt stack, - * read the original task stack pointer from the dummy frame. + * unpack the dummy frame to find the original elr. * * Check the frame->fp we read from the bottom of the irq_stack, * and the original task stack pointer are both in current->stack. */ if (frame->sp == irq_stack_ptr) { + struct pt_regs *irq_args; unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - if(object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) + if (object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) { frame->sp = orig_sp; + + /* orig_sp is the saved pt_regs, find the elr */ + irq_args = (struct pt_regs *)orig_sp; + frame->pc = irq_args->pc; + } else { + /* + * This frame has a non-standard format, and we + * didn't fix it, because the data looked wrong. + * Refuse to output this frame. + */ + return -EINVAL; + } } return 0; From b10551a35399a277afe9b2e28266e2a00a5f6253 Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:31 -0800 Subject: [PATCH 0351/3220] UPSTREAM: arm64: Defer dcache flush in __cpu_copy_user_page Defer dcache flushing to __sync_icache_dcache by calling flush_dcache_page which clears PG_dcache_clean flag. 
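The deferral pattern is worth spelling out. In the sketch below (standalone C with invented types; the two functions only mimic the roles of flush_dcache_page() and __sync_icache_dcache() named above), the copy path merely marks the page as needing maintenance, and the flush happens later, only for pages that actually need it:

    #include <stdbool.h>
    #include <stdio.h>

    struct page_state { bool dcache_clean; };

    static void copy_user_page(struct page_state *pg)
    {
        /* the actual copy_page(kto, kfrom) would happen here */
        pg->dcache_clean = false; /* flush_dcache_page() analogue */
    }

    static void sync_icache_dcache(struct page_state *pg)
    {
        if (!pg->dcache_clean) { /* test_and_set_bit(PG_dcache_clean) analogue */
            puts("clean D-cache and invalidate I-cache for this page");
            pg->dcache_clean = true;
        }
    }

    int main(void)
    {
        struct page_state pg = { .dcache_clean = true };

        copy_user_page(&pg);     /* no cache maintenance on the copy path */
        sync_icache_dcache(&pg); /* deferred maintenance happens here */
        return 0;
    }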
Acked-by: Catalin Marinas Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit e6b1185f77351aa154e63bd54b05d07ff99d4ffa) Signed-off-by: Jeff Vander Stoep Change-Id: I2e02ce2f43f68287337bed30e3c3455c0eee4164 --- arch/arm64/mm/copypage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 13bbc3be6f5a..22e4cb4d6f53 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -24,8 +24,9 @@ void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr) { + struct page *page = virt_to_page(kto); copy_page(kto, kfrom); - __flush_dcache_area(kto, PAGE_SIZE); + flush_dcache_page(page); } EXPORT_SYMBOL_GPL(__cpu_copy_user_page); From 350a6a0fe0fe70d15a59084c8f4863d10271887f Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Wed, 13 Jan 2016 14:50:03 +0000 Subject: [PATCH 0352/3220] BACKPORT: arm64: kernel: fix architected PMU registers unconditional access The Performance Monitors extension is an optional feature of the AArch64 architecture, therefore, in order to access Performance Monitors registers safely, the kernel should detect the architected PMU unit presence through the ID_AA64DFR0_EL1 register PMUVer field before accessing them. This patch implements a guard by reading the ID_AA64DFR0_EL1 register PMUVer field to detect the architected PMU presence and prevent accessing PMU system registers if the Performance Monitors extension is not implemented in the core. Cc: Peter Maydell Cc: Mark Rutland Cc: Fixes: 60792ad349f3 ("arm64: kernel: enforce pmuserenr_el0 initialization and restore") Signed-off-by: Lorenzo Pieralisi Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f436b2ac90a095746beb6729b8ee8ed87c9eaede) Signed-off-by: Jeff Vander Stoep Change-Id: Ie442b9feba5d143bd6b6c82d70190fc8bc95298d --- arch/arm64/kernel/head.S | 5 +++++ arch/arm64/mm/proc-macros.S | 12 ++++++++++++ arch/arm64/mm/proc.S | 2 ++ 3 files changed, 19 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b363f340f2c7..17ce7285bb12 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -515,9 +515,14 @@ CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems #endif /* EL2 debug */ + mrs x0, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx x0, x0, #8, #4 + cmp x0, #1 + b.lt 4f // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps ubfx x0, x0, #11, #5 // to EL2 and allow access to msr mdcr_el2, x0 // all PMU counters from EL1 +4: /* Stage-2 translation */ msr vttbr_el2, xzr diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index 4c4d93c4bf65..d69dffffaa89 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -62,3 +62,15 @@ bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH #endif .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index b6f9053ab184..c164d2cb35c0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -117,6 +117,7 @@ ENTRY(cpu_do_resume) */ ubfx x11, x11, 
#1, #1 msr oslar_el1, x11 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 mov x0, x12 dsb nsh // Make sure local tlb invalidation completed isb @@ -153,6 +154,7 @@ ENTRY(__cpu_setup) msr cpacr_el1, x0 // Enable FP/ASIMD mov x0, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x0 // access to the DCC from EL0 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 /* * Memory region attributes for LPAE: * From d378add20954990566e91f914105e2638564d8ba Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:32 -0800 Subject: [PATCH 0353/3220] BACKPORT: arm64: Use PoU cache instr for I/D coherency In systems with three levels of cache (PoU at L1 and PoC at L3), PoC cache flush instructions flush L2 and L3 caches, which could affect performance. For cache flushes for I and D coherency, PoU should suffice. So changing all I and D coherency related cache flushes to PoU. Introduced a new __clean_dcache_area_pou API for dcache flush till PoU and provided a common macro for __flush_dcache_area and __clean_dcache_area_pou. Also, now in __sync_icache_dcache, icache invalidation for non-aliasing VIPT icache is done only for that particular page instead of the earlier __flush_icache_all. Reviewed-by: Catalin Marinas Reviewed-by: Mark Rutland Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 0a28714c53fd4f7aea709be7577dfbe0095c8c3e) Signed-off-by: Jeff Vander Stoep Change-Id: I64f065140d5e8783e91ed53ae9c7a2e33a3e515a --- arch/arm64/include/asm/cacheflush.h | 1 + arch/arm64/mm/cache.S | 28 ++++++++++++++---------- arch/arm64/mm/flush.c | 33 ++++++++++++++++------------- arch/arm64/mm/proc-macros.S | 22 +++++++++++++++++++ 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 54efedaf331f..7fc294c3bc5b 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -68,6 +68,7 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); static inline void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index cfa44a6adc0a..6df07069a025 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -81,25 +81,31 @@ ENDPROC(__flush_cache_user_range) /* * __flush_dcache_area(kaddr, size) * - * Ensure that the data held in the page kaddr is written back to the - * page in question. + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned and invalidated to the PoC. * * - kaddr - kernel address * - size - size in question */ ENTRY(__flush_dcache_area) - dcache_line_size x2, x3 - add x1, x0, x1 - sub x3, x2, #1 - bic x0, x0, x3 -1: dc civac, x0 // clean & invalidate D line / unified line - add x0, x0, x2 - cmp x0, x1 - b.lo 1b - dsb sy + dcache_by_line_op civac, sy, x0, x1, x2, x3 ret ENDPIPROC(__flush_dcache_area) +/* + * __clean_dcache_area_pou(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoU.
+ * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pou) + dcache_by_line_op cvau, ish, x0, x1, x2, x3 + ret +ENDPROC(__clean_dcache_area_pou) + /* * __inval_cache_range(start, end) * - start - start address of region diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index c26b804015e8..46649d6e6c5a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -34,19 +34,24 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, __flush_icache_all(); } +static void sync_icache_aliases(void *kaddr, unsigned long len) +{ + unsigned long addr = (unsigned long)kaddr; + + if (icache_is_aliasing()) { + __clean_dcache_area_pou(kaddr, len); + __flush_icache_all(); + } else { + flush_icache_range(addr, addr + len); + } +} + static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *kaddr, unsigned long len) { - if (vma->vm_flags & VM_EXEC) { - unsigned long addr = (unsigned long)kaddr; - if (icache_is_aliasing()) { - __flush_dcache_area(kaddr, len); - __flush_icache_all(); - } else { - flush_icache_range(addr, addr + len); - } - } + if (vma->vm_flags & VM_EXEC) + sync_icache_aliases(kaddr, len); } /* @@ -74,13 +79,11 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr) if (!page_mapping(page)) return; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) { - __flush_dcache_area(page_address(page), - PAGE_SIZE << compound_order(page)); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + sync_icache_aliases(page_address(page), + PAGE_SIZE << compound_order(page)); + else if (icache_is_aivivt()) __flush_icache_all(); - } else if (icache_is_aivivt()) { - __flush_icache_all(); - } } /* diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index d69dffffaa89..984edcda1850 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -74,3 +74,25 @@ msr pmuserenr_el0, xzr // Disable PMU access from EL0 9000: .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruction + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm From 357838314d69e67ff89db49a7170fa1968638b88 Mon Sep 17 00:00:00 2001 From: David Woods Date: Thu, 17 Dec 2015 14:31:26 -0500 Subject: [PATCH 0354/3220] UPSTREAM: arm64: hugetlb: add support for PTE contiguous bit The arm64 MMU supports a Contiguous bit which is a hint that the TTE is one of a set of contiguous entries which can be cached in a single TLB entry. Supporting this bit adds new intermediate huge page sizes. The set of huge page sizes available depends on the base page size. Without using contiguous pages the huge page sizes are as follows. 4KB: 2MB 1GB 64KB: 512MB With a 4KB granule, the contiguous bit groups together sets of 16 pages and with a 64KB granule it groups sets of 32 pages. This enables two new huge page sizes in each case, so that the full set of available sizes is as follows.
4KB: 64KB 2MB 32MB 1GB 64KB: 2MB 512MB 16GB If a 16KB granule is used then the contiguous bit groups 128 pages at the PTE level and 32 pages at the PMD level. If the base page size is set to 64KB then 2MB pages are enabled by default. It is possible in the future to make 2MB the default huge page size for both 4KB and 64KB granules. Reviewed-by: Chris Metcalf Reviewed-by: Steve Capper Signed-off-by: David Woods Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 66b3923a1a0f77a563b43f43f6ad091354abbfe9) Signed-off-by: Jeff Vander Stoep Change-Id: I5e99c5165bc5eb966adf4d4523632fd9eedd9602 --- arch/arm64/Kconfig | 3 - arch/arm64/include/asm/hugetlb.h | 44 ++-- arch/arm64/include/asm/pgtable-hwdef.h | 18 +- arch/arm64/include/asm/pgtable.h | 10 +- arch/arm64/mm/hugetlbpage.c | 274 ++++++++++++++++++++++++- include/linux/hugetlb.h | 2 - 6 files changed, 313 insertions(+), 38 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 92f234a5579b..ac3dbd933064 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -564,9 +564,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARCH_WANT_HUGE_PMD_SHARE def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index bb4052e85dba..bbc1e35aa601 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -26,36 +26,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return *ptep; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} - -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -97,4 +68,19 @@ static inline void arch_clear_hugepage_flags(struct page *page) clear_bit(PG_dcache_clean, &page->flags); } +extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable); +#define arch_make_huge_pte arch_make_huge_pte +extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d6739e836f7b..5c25b831273d 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h 
@@ -90,7 +90,23 @@ /* * Contiguous page definitions. */ -#define CONT_PTES (_AC(1, UL) << CONT_SHIFT) +#ifdef CONFIG_ARM64_64K_PAGES +#define CONT_PTE_SHIFT 5 +#define CONT_PMD_SHIFT 5 +#elif defined(CONFIG_ARM64_16K_PAGES) +#define CONT_PTE_SHIFT 7 +#define CONT_PMD_SHIFT 5 +#else +#define CONT_PTE_SHIFT 4 +#define CONT_PMD_SHIFT 4 +#endif + +#define CONT_PTES (1 << CONT_PTE_SHIFT) +#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE) +#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1)) +#define CONT_PMDS (1 << CONT_PMD_SHIFT) +#define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) +#define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) /* the the numerical offset of the PTE within a range of CONT_PTES */ #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1)) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 824416a219e7..51f382b7e5c7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -227,7 +227,8 @@ static inline pte_t pte_mkspecial(pte_t pte) static inline pte_t pte_mkcont(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_CONT)); + pte = set_pte_bit(pte, __pgprot(PTE_CONT)); + return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); } static inline pte_t pte_mknoncont(pte_t pte) @@ -235,6 +236,11 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } +static inline pmd_t pmd_mkcont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | PMD_SECT_CONT); +} + static inline void set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; @@ -308,7 +314,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, /* * Hugetlb definitions. */ -#define HUGE_MAX_HSTATE 2 +#define HUGE_MAX_HSTATE 4 #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 383b03ff38f8..82d607c3614e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -41,17 +41,289 @@ int pud_huge(pud_t pud) #endif } +static int find_num_contig(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, size_t *pgsize) +{ + pgd_t *pgd = pgd_offset(mm, addr); + pud_t *pud; + pmd_t *pmd; + + *pgsize = PAGE_SIZE; + if (!pte_cont(pte)) + return 1; + if (!pgd_present(*pgd)) { + VM_BUG_ON(!pgd_present(*pgd)); + return 1; + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + VM_BUG_ON(!pud_present(*pud)); + return 1; + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + VM_BUG_ON(!pmd_present(*pmd)); + return 1; + } + if ((pte_t *)pmd == ptep) { + *pgsize = PMD_SIZE; + return CONT_PMDS; + } + return CONT_PTES; +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + size_t pgsize; + int i; + int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + unsigned long pfn; + pgprot_t hugeprot; + + if (ncontig == 1) { + set_pte_at(mm, addr, ptep, pte); + return; + } + + pfn = pte_pfn(pte); + hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); + for (i = 0; i < ncontig; i++) { + pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, + pte_val(pfn_pte(pfn, hugeprot))); + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + ptep++; + pfn += pgsize >> PAGE_SHIFT; + addr += pgsize; + } +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte = NULL; + + pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz); + pgd = pgd_offset(mm, 
addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return NULL; + + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else if (sz == (PAGE_SIZE * CONT_PTES)) { + pmd_t *pmd = pmd_alloc(mm, pud, addr); + + WARN_ON(addr & (sz - 1)); + /* + * Note that if this code were ever ported to the + * 32-bit arm platform then it will cause trouble in + * the case where CONFIG_HIGHPTE is set, since there + * will be no pte_unmap() to correspond with this + * pte_alloc_map(). + */ + pte = pte_alloc_map(mm, NULL, pmd, addr); + } else if (sz == PMD_SIZE) { + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && + pud_none(*pud)) + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + } else if (sz == (PMD_SIZE * CONT_PMDS)) { + pmd_t *pmd; + + pmd = pmd_alloc(mm, pud, addr); + WARN_ON(addr & (sz - 1)); + return (pte_t *)pmd; + } + + pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr, + sz, pte, pte_val(*pte)); + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd); + if (!pgd_present(*pgd)) + return NULL; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return NULL; + + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return NULL; + + if (pte_cont(pmd_pte(*pmd))) { + pmd = pmd_offset( + pud, (addr & CONT_PMD_MASK)); + return (pte_t *)pmd; + } + if (pmd_huge(*pmd)) + return (pte_t *)pmd; + pte = pte_offset_kernel(pmd, addr); + if (pte_present(*pte) && pte_cont(*pte)) { + pte = pte_offset_kernel( + pmd, (addr & CONT_PTE_MASK)); + return pte; + } + return NULL; +} + +pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable) +{ + size_t pagesize = huge_page_size(hstate_vma(vma)); + + if (pagesize == CONT_PTE_SIZE) { + entry = pte_mkcont(entry); + } else if (pagesize == CONT_PMD_SIZE) { + entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); + } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) { + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); + } + return entry; +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte; + + if (pte_cont(*ptep)) { + int ncontig, i; + size_t pgsize; + pte_t *cpte; + bool is_dirty = false; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + /* save the 1st pte to return */ + pte = ptep_get_and_clear(mm, addr, cpte); + for (i = 1; i < ncontig; ++i) { + /* + * If HW_AFDBM is enabled, then the HW could + * turn on the dirty bit for any of the page + * in the set, so check them all. 
+ */ + ++cpte; + if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + is_dirty = true; + } + if (is_dirty) + return pte_mkdirty(pte); + else + return pte; + } else { + return ptep_get_and_clear(mm, addr, ptep); + } +} + +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + pte_t *cpte; + + if (pte_cont(pte)) { + int ncontig, i, changed = 0; + size_t pgsize = 0; + unsigned long pfn = pte_pfn(pte); + /* Select all bits except the pfn */ + pgprot_t hugeprot = + __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ + pte_val(pte)); + + cpte = huge_pte_offset(vma->vm_mm, addr); + pfn = pte_pfn(*cpte); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) { + changed = ptep_set_access_flags(vma, addr, cpte, + pfn_pte(pfn, + hugeprot), + dirty); + pfn += pgsize >> PAGE_SHIFT; + } + return changed; + } else { + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + } +} + +void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_set_wrprotect(mm, addr, cpte); + } else { + ptep_set_wrprotect(mm, addr, ptep); + } +} + +void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(vma->vm_mm, addr); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_clear_flush(vma, addr, cpte); + } else { + ptep_clear_flush(vma, addr, ptep); + } +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else if (ps == (PAGE_SIZE * CONT_PTES)) { + hugetlb_add_hstate(CONT_PTE_SHIFT); + } else if (ps == (PMD_SIZE * CONT_PMDS)) { + hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; } return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_ARM64_64K_PAGES +static __init int add_default_hugepagesz(void) +{ + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) + hugetlb_add_hstate(CONT_PMD_SHIFT); + return 0; +} +arch_initcall(add_default_hugepagesz); +#endif diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 685c262e0be8..b0eb06423d5e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -96,9 +96,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct address_space *mapping, pgoff_t idx, unsigned long address); -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -#endif extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; From 0c6cdfec1241b011aeecb1cc0b4d69f122a8d075 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Dec 2015 16:01:47 +0000 Subject: [PATCH 0355/3220] UPSTREAM: arm64: remove irq_count and do_softirq_own_stack() sysrq_handle_reboot() re-enables interrupts while on the irq stack. 
The irq_stack implementation wrongly assumed this would only ever happen via the softirq path, allowing it to update irq_count late, in do_softirq_own_stack(). This means if an irq occurs in sysrq_handle_reboot(), during emergency_restart() the stack will be corrupted, as irq_count wasn't updated. Lose the optimisation, and instead of moving the adding/subtracting of irq_count into irq_stack_entry/irq_stack_exit, remove it, and compare sp_el0 (struct thread_info) with sp & ~(THREAD_SIZE - 1). This tells us if we are on a task stack, if so, we can safely switch to the irq stack. Finally, remove do_softirq_own_stack(), we don't need it anymore. Reported-by: Will Deacon Signed-off-by: James Morse [will: use get_thread_info macro] Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit d224a69e3d80fe08f285d1f41d21b590bae4fa9f) Signed-off-by: Jeff Vander Stoep Change-Id: I1f613279bf875443b10d65b1cd1ed4a6abfcb605 --- arch/arm64/include/asm/irq.h | 2 -- arch/arm64/kernel/entry.S | 19 +++++++++--------- arch/arm64/kernel/irq.c | 38 +----------------------------------- 3 files changed, 11 insertions(+), 48 deletions(-) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 3bece4379bd9..b77197d941fc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,8 +11,6 @@ #include #include -#define __ARCH_HAS_DO_SOFTIRQ - struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 0667fb7d8bb1..c0db321db7e1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -181,19 +181,20 @@ alternative_endif .macro irq_stack_entry mov x19, sp // preserve the original sp - this_cpu_ptr irq_stack, x25, x26 - /* - * Check the lowest address on irq_stack for the irq_count value, - * incremented by do_softirq_own_stack if we have re-enabled irqs - * while on the irq_stack. + * Compare sp with the current thread_info, if the top + * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and + * should switch to the irq stack. */ - ldr x26, [x25] - cbnz x26, 9998f // recursive use? + and x25, x19, #~(THREAD_SIZE - 1) + cmp x25, tsk + b.ne 9998f - /* switch to the irq stack */ + this_cpu_ptr irq_stack, x25, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 + + /* switch to the irq stack */ mov sp, x26 /* @@ -405,10 +406,10 @@ el1_irq: bl trace_hardirqs_off #endif + get_thread_info tsk irq_handler #ifdef CONFIG_PREEMPT - get_thread_info tsk ldr w24, [tsk, #TI_PREEMPT] // get preempt count cbnz w24, 1f // preempt count != 0 ldr x0, [tsk, #TI_FLAGS] // get flags diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index ff7ebb710e51..2386b26c0712 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,24 +25,14 @@ #include #include #include -#include #include #include unsigned long irq_err_count; -/* - * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. - * irq_stack[0] is used as irq_count, a non-zero value indicates the stack - * is in use, and el?_irq() shouldn't switch to it. This is used to detect - * recursive use of the irq_stack, it is lazily updated by - * do_softirq_own_stack(), which is called on the irq_stack, before - * re-enabling interrupts to process softirqs. - */ +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. 
*/ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); -#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) - int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -66,29 +56,3 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } - -/* - * do_softirq_own_stack() is called from irq_exit() before __do_softirq() - * re-enables interrupts, at which point we may re-enter el?_irq(). We - * increase irq_count here so that el1_irq() knows that it is already on the - * irq stack. - * - * Called with interrupts disabled, so we don't worry about moving cpu, or - * being interrupted while modifying irq_count. - * - * This function doesn't actually switch stack. - */ -void do_softirq_own_stack(void) -{ - int cpu = smp_processor_id(); - - WARN_ON_ONCE(!irqs_disabled()); - - if (on_irq_stack(current_stack_pointer, cpu)) { - IRQ_COUNT()++; - __do_softirq(); - IRQ_COUNT()--; - } else { - __do_softirq(); - } -} From cc6027395e152f2a7aee43623bac1d3bdd8f5da0 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:39 +0900 Subject: [PATCH 0356/3220] UPSTREAM: arm64: ftrace: modify a stack frame in a safe way Function graph tracer modifies a return address (LR) in a stack frame by calling ftrace_prepare_return() in a traced function's function prologue. The current code does this modification before preserving an original address at ftrace_push_return_trace() and there is always a small window of inconsistency when an interrupt occurs. This doesn't matter, as far as an interrupt stack is introduced, because stack tracer won't be invoked in an interrupt context. But it would be better to proactively minimize such a window by moving the LR modification after ftrace_push_return_trace(). Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 79fdee9b6355c9720f14717e1ad66af51bb331b5) Signed-off-by: Jeff Vander Stoep Change-Id: Ief0a136aa01348b4d0d3f447544f21fd77835c67 --- arch/arm64/kernel/ftrace.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8f7005bc35bd..ebecf9aa33d1 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -129,23 +129,20 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, * on other archs. It's unlikely on AArch64. */ old = *parent; - *parent = return_hooker; trace.func = self_addr; trace.depth = current->curr_ret_stack + 1; /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { - *parent = old; + if (!ftrace_graph_entry(&trace)) return; - } err = ftrace_push_return_trace(old, self_addr, &trace.depth, frame_pointer); - if (err == -EBUSY) { - *parent = old; + if (err == -EBUSY) return; - } + else + *parent = return_hooker; } #ifdef CONFIG_DYNAMIC_FTRACE From 6adbc95c63d8b5a2873865ee8b6a4092ac95a0e9 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:40 +0900 Subject: [PATCH 0357/3220] UPSTREAM: arm64: pass a task parameter to unwind_frame() Function graph tracer modifies a return address (LR) in a stack frame to hook a function's return. This will result in many useless entries (return_to_handler) showing up in a call stack list. We will fix this problem in a later patch ("arm64: ftrace: fix a stack tracer's output under function graph tracer"). 
But since real return addresses are saved in ret_stack[] array in struct task_struct, unwind functions need to be notified of, in addition to a stack pointer address, which task is being traced in order to find out real return addresses. This patch extends unwind functions' interfaces by adding an extra argument of a pointer to task_struct. Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit fe13f95b720075327a761fe6ddb45b0c90cab504) Signed-off-by: Jeff Vander Stoep Change-Id: I92a9a07468c182d5abbacaa73a90984ab11ad535 --- arch/arm64/include/asm/stacktrace.h | 6 ++++-- arch/arm64/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/return_address.c | 2 +- arch/arm64/kernel/stacktrace.c | 8 ++++---- arch/arm64/kernel/time.c | 2 +- arch/arm64/kernel/traps.c | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 7318f6d54aa9..6fb61c5090b4 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -16,14 +16,16 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H +struct task_struct; + struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; }; -extern int unwind_frame(struct stackframe *frame); -extern void walk_stackframe(struct stackframe *frame, +extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); +extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 3aa74830cc69..797220da912b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -165,7 +165,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.sp = regs->sp; frame.pc = regs->pc; - walk_stackframe(&frame, callchain_trace, entry); + walk_stackframe(current, &frame, callchain_trace, entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 62068ded1e90..51be6356fb03 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -414,7 +414,7 @@ unsigned long get_wchan(struct task_struct *p) do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(&frame)) + unwind_frame(p, &frame)) return 0; if (!in_sched_functions(frame.pc)) return frame.pc; diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 6c4fd2810ecb..07b37ac05be4 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -44,7 +44,7 @@ void *return_address(unsigned int level) frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ - walk_stackframe(&frame, save_return_addr, &data); + walk_stackframe(current, &frame, save_return_addr, &data); if (!data.level) return data.addr; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b9fd3a8abfc1..f7ee597ec883 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -36,7 +36,7 @@ * ldp x29, x30, [sp] * add sp, sp, #0x10 */ -int notrace unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; @@ -99,7 
+99,7 @@ int notrace unwind_frame(struct stackframe *frame) return 0; } -void notrace walk_stackframe(struct stackframe *frame, +void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data) { while (1) { @@ -107,7 +107,7 @@ void notrace walk_stackframe(struct stackframe *frame, if (fn(frame, data)) break; - ret = unwind_frame(frame); + ret = unwind_frame(tsk, frame); if (ret < 0) break; } @@ -159,7 +159,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.pc = (unsigned long)save_stack_trace_tsk; } - walk_stackframe(&frame, save_trace, &data); + walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 13339b6ffc1a..6e5c521f123a 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -53,7 +53,7 @@ unsigned long profile_pc(struct pt_regs *regs) frame.sp = regs->sp; frame.pc = regs->pc; do { - int ret = unwind_frame(&frame); + int ret = unwind_frame(NULL, &frame); if (ret < 0) return 0; } while (in_lock_functions(frame.pc)); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8a0084541f84..937008523fa5 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -177,7 +177,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) int ret; dump_backtrace_entry(where); - ret = unwind_frame(&frame); + ret = unwind_frame(tsk, &frame); if (ret < 0) break; stack = frame.sp; From 0c078f6a47bb3de392e16788a119d575441c1fb2 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:41 +0900 Subject: [PATCH 0358/3220] UPSTREAM: arm64: ftrace: fix a stack tracer's output under function graph tracer Function graph tracer modifies a return address (LR) in a stack frame to hook a function return. This will result in many useless entries (return_to_handler) showing up in a) a stack tracer's output b) perf call graph (with perf record -g) c) dump_backtrace (at panic et al.) For example, in case of a), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo 1 > /proc/sys/kernel/stack_trace_enabled $ cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (54 entries) ----- ---- -------- 0) 4504 16 gic_raise_softirq+0x28/0x150 1) 4488 80 smp_cross_call+0x38/0xb8 2) 4408 48 return_to_handler+0x0/0x40 3) 4360 32 return_to_handler+0x0/0x40 ... In case of b), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ perf record -e mem:XXX:x -ag -- sleep 10 $ perf report ... | | |--0.22%-- 0x550f8 | | | 0x10888 | | | el0_svc_naked | | | sys_openat | | | return_to_handler | | | return_to_handler ... In case of c), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo c > /proc/sysrq-trigger ... Call trace: [] sysrq_handle_crash+0x24/0x30 [] return_to_handler+0x0/0x40 [] return_to_handler+0x0/0x40 ... This patch replaces such entries with real addresses preserved in current->ret_stack[] at unwind_frame(). This way, we can cover all the cases. 
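The core of the fix is a small substitution in the unwinder, modelled below in self-contained C. All names here (TRAMPOLINE, struct ret_entry, real_pc) are invented for illustration; the real code compares against return_to_handler and indexes current->ret_stack[] via frame->graph:

    #include <stdio.h>

    #define TRAMPOLINE 0xdeadbeefUL /* stands in for return_to_handler */

    struct ret_entry { unsigned long ret; };

    static unsigned long real_pc(unsigned long pc,
                                 const struct ret_entry *ret_stack, int *graph)
    {
        /* a trampoline PC is replaced by the saved real return address */
        if (pc == TRAMPOLINE && ret_stack && *graph >= 0)
            return ret_stack[(*graph)--].ret;
        return pc;
    }

    int main(void)
    {
        const struct ret_entry ret_stack[] = { { 0x1000 }, { 0x2000 } };
        int graph = 1; /* index of the most recently hooked return */

        /* consecutive trampoline frames resolve to distinct real PCs */
        printf("%#lx\n", real_pc(TRAMPOLINE, ret_stack, &graph)); /* 0x2000 */
        printf("%#lx\n", real_pc(TRAMPOLINE, ret_stack, &graph)); /* 0x1000 */
        return 0;
    }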
Reviewed-by: Jungseok Lee Signed-off-by: AKASHI Takahiro [will: fixed minor context changes conflicting with irq stack bits] Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 20380bb390a443b2c5c8800cec59743faf8151b4) Signed-off-by: Jeff Vander Stoep Change-Id: I6360182f8d04fdd2e31c0cb6054aefa2adb216e7 --- arch/arm64/include/asm/ftrace.h | 2 ++ arch/arm64/include/asm/stacktrace.h | 3 +++ arch/arm64/kernel/perf_callchain.c | 3 +++ arch/arm64/kernel/process.c | 3 +++ arch/arm64/kernel/return_address.c | 3 +++ arch/arm64/kernel/stacktrace.c | 17 +++++++++++++++++ arch/arm64/kernel/time.c | 3 +++ arch/arm64/kernel/traps.c | 26 ++++++++++++++++++++------ 8 files changed, 54 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index c5534facf941..3c60f37e48ab 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -28,6 +28,8 @@ struct dyn_arch_ftrace { extern unsigned long ftrace_graph_call; +extern void return_to_handler(void); + static inline unsigned long ftrace_call_adjust(unsigned long addr) { /* diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 6fb61c5090b4..801a16dbbdf6 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -22,6 +22,9 @@ struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned int graph; +#endif }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 797220da912b..ff4665462a02 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -164,6 +164,9 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, callchain_trace, entry); } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 51be6356fb03..5f6244526e31 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -410,6 +410,9 @@ unsigned long get_wchan(struct task_struct *p) frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = p->curr_ret_stack; +#endif stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 07b37ac05be4..1718706fde83 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -43,6 +43,9 @@ void *return_address(unsigned int level) frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, save_return_addr, &data); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index f7ee597ec883..4fad9787ab46 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include @@ -66,6 +67,19 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->fp = *(unsigned long *)(fp); frame->pc = 
*(unsigned long *)(fp + 8); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (tsk && tsk->ret_stack && + (frame->pc == (unsigned long)return_to_handler)) { + /* + * This is a case where function graph tracer has + * modified a return address (LR) in a stack frame + * to hook a function return. + * So replace it to an original value. + */ + frame->pc = tsk->ret_stack[frame->graph--].ret; + } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Check whether we are going to walk through from interrupt stack * to task stack. @@ -158,6 +172,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 6e5c521f123a..59779699a1a4 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -52,6 +52,9 @@ unsigned long profile_pc(struct pt_regs *regs) frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = -1; /* no task info */ +#endif do { int ret = unwind_frame(NULL, &frame); if (ret < 0) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 937008523fa5..bdc293f6adc4 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -147,17 +147,14 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) tsk = current; - if (regs) { - frame.fp = regs->regs[29]; - frame.sp = regs->sp; - frame.pc = regs->pc; - } else if (tsk == current) { + if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; @@ -169,14 +166,31 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif + skip = !!regs; pr_emerg("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; int ret; - dump_backtrace_entry(where); + /* skip until specified stack frame */ + if (!skip) { + dump_backtrace_entry(where); + } else if (frame.fp == regs->regs[29]) { + skip = 0; + /* + * Mostly, this is the case where this function is + * called in panic/abort. As exception handler's + * stack frame does not contain the corresponding pc + * at which an exception has taken place, use regs->pc + * instead. + */ + dump_backtrace_entry(regs->pc); + } ret = unwind_frame(tsk, &frame); if (ret < 0) break; From cc68289965af2e593fdf5161f7e6407d48d79191 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 21 Dec 2015 16:44:27 +0000 Subject: [PATCH 0359/3220] UPSTREAM: arm64: traps: address fallout from printk -> pr_* conversion Commit ac7b406c1a9d ("arm64: Use pr_* instead of printk") was a fairly mindless s/printk/pr_*/ change driven by a complaint from checkpatch. 
As is usual with such changes, this has led to some odd behaviour on arm64: * syslog now picks up the "pr_emerg" line from dump_backtrace, but not the actual trace, which leads to a bunch of "kernel:Call trace:" lines in the log * __{pte,pmd,pgd}_error print at KERN_CRIT, as opposed to KERN_ERR which is used by other architectures. This patch restores the original printk behaviour for dump_backtrace and downgrades the pgtable error macros to KERN_ERR. Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit c9cd0ed925c0b927283d4739bfe689eb9d1e9dfd) Signed-off-by: Jeff Vander Stoep Change-Id: Iaf028e5368df4623f7257ef432a7f9da86261609 --- arch/arm64/kernel/traps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index bdc293f6adc4..cbedd724f48e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -171,7 +171,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) #endif skip = !!regs; - pr_emerg("Call trace:\n"); + printk("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; @@ -482,22 +482,22 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) void __pte_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pte %016lx.\n", file, line, val); + pr_err("%s:%d: bad pte %016lx.\n", file, line, val); } void __pmd_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pmd %016lx.\n", file, line, val); + pr_err("%s:%d: bad pmd %016lx.\n", file, line, val); } void __pud_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pud %016lx.\n", file, line, val); + pr_err("%s:%d: bad pud %016lx.\n", file, line, val); } void __pgd_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pgd %016lx.\n", file, line, val); + pr_err("%s:%d: bad pgd %016lx.\n", file, line, val); } /* GENERIC_BUG traps */ From 606d0cab2091ee86fa8dc7c77a53a9e59e9fac05 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 Jan 2016 15:36:59 +0000 Subject: [PATCH 0360/3220] UPSTREAM: arm64: mm: move pgd_cache initialisation to pgtable_cache_init Initialising the support for EFI runtime services requires us to allocate a pgd off the back of an early_initcall. On systems where the PGD_SIZE is smaller than PAGE_SIZE (e.g. 64k pages and 48-bit VA), the pgd_cache isn't initialised at this stage, and we panic with a NULL dereference during boot: Unable to handle kernel NULL pointer dereference at virtual address 00000000 __create_mapping.isra.5+0x84/0x350 create_pgd_mapping+0x20/0x28 efi_create_mapping+0x5c/0x6c arm_enable_runtime_services+0x154/0x1e4 do_one_initcall+0x8c/0x190 kernel_init_freeable+0x84/0x1ec kernel_init+0x10/0xe0 ret_from_fork+0x10/0x50 This patch fixes the problem by initialising the pgd_cache earlier, in the pgtable_cache_init callback, which sounds suspiciously like what it was intended for.
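For context (an editorial sketch, not part of the patch): early_initcalls run before core_initcalls, so the old core_initcall(pgd_cache_init) had not yet run when the EFI code above allocated its pgd. Simplified boot ordering:

/*
 * start_kernel()
 *   mm_init() -> pgtable_init() -> pgtable_cache_init()
 *                                  (pgd_cache is now initialised here)
 *   ...
 *   kernel_init() -> kernel_init_freeable()
 *     do_pre_smp_initcalls()  // early_initcall: arm_enable_runtime_services()
 *                             // allocates a pgd
 *     do_basic_setup()        // core_initcall level: pgd_cache_init used to
 *                             // run only here, after the allocation above
 */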
Reported-by: Dennis Chen Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 39b5be9b4233a9f212b98242bddf008f379b5122) Signed-off-by: Jeff Vander Stoep Change-Id: I124f03d19299be93124af35641294bf73c13bb22 --- arch/arm64/include/asm/pgtable.h | 3 ++- arch/arm64/mm/pgd.c | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 51f382b7e5c7..69d2e2f86bce 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -680,7 +680,8 @@ extern int kern_addr_valid(unsigned long addr); #include -#define pgtable_cache_init() do { } while (0) +void pgd_cache_init(void); +#define pgtable_cache_init pgd_cache_init /* * On AArch64, the cache coherency is handled via the set_pte_at() function. diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index cb3ba1b812e7..ae11d4e03d0e 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -46,14 +46,14 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { + if (PGD_SIZE == PAGE_SIZE) + return; + /* * Naturally aligned pgds required by the architecture. */ - if (PGD_SIZE != PAGE_SIZE) - pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, - SLAB_PANIC, NULL); - return 0; + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, + SLAB_PANIC, NULL); } -core_initcall(pgd_cache_init); From d686cf4829c01ca92e8b10603b9250221b3bdfcb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 5 Jan 2016 17:33:34 +0000 Subject: [PATCH 0361/3220] UPSTREAM: arm64: entry: remove pointless SPSR mode check In work_pending, we may skip work if the stacked SPSR value represents anything other than an EL0 context. We then immediately invoke the kernel_exit 0 macro as part of ret_to_user, assuming a return to EL0. This is somewhat confusing. We use work_pending as part of the ret_to_user/ret_fast_syscall state machine. We only use ret_fast_syscall in the return from an SVC issued from EL0. We use ret_to_user for return from EL0 exception handlers and also for return from ret_from_fork in the case the task was not a kernel thread (i.e. it is a user task). Thus in all cases the stacked SPSR value must represent an EL0 context, and the check is redundant. This patch removes it, along with the now unused no_work_pending label. Cc: Chris Metcalf Acked-by: Catalin Marinas Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit ee03353bc04f8e460cc4e3da80d9721d9ecb89f1) Signed-off-by: Jeff Vander Stoep Change-Id: I2738149342ef469e8cde7c5f8f7c65daab93fb2b --- arch/arm64/kernel/entry.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c0db321db7e1..1f7f5a2b61bf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -676,10 +676,7 @@ ret_fast_syscall_trace: work_pending: tbnz x1, #TIF_NEED_RESCHED, work_resched /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ - ldr x2, [sp, #S_PSTATE] mov x0, sp // 'regs' - tst x2, #PSR_MODE_MASK // user mode regs? 
- b.ne no_work_pending // returning to kernel enable_irq // enable interrupts for do_notify_resume() bl do_notify_resume b ret_to_user @@ -698,7 +695,6 @@ ret_to_user: and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending enable_step_tsk x1, x2 -no_work_pending: kernel_exit 0 ENDPROC(ret_to_user) From 3c31c6d7098894f0bf886ebe711fd930855626e5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 23 Dec 2015 10:29:28 +0100 Subject: [PATCH 0362/3220] UPSTREAM: efi: stub: define DISABLE_BRANCH_PROFILING for all architectures This moves the DISABLE_BRANCH_PROFILING define from the x86-specific to the general CFLAGS definition for the stub. This fixes build errors when building for arm64 with CONFIG_PROFILE_ALL_BRANCHES_ENABLED. Reviewed-by: Matt Fleming Reported-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit b523e185bba36164ca48a190f5468c140d815414) Signed-off-by: Jeff Vander Stoep Change-Id: Id5a2a3986adc60ddf3d247c038667ce9719f3ee0 --- drivers/firmware/efi/libstub/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 3c0467d3688c..c0ddd1b8dca3 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -8,7 +8,7 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ -fPIC -fno-strict-aliasing -mno-red-zone \ - -mno-mmx -mno-sse -DDISABLE_BRANCH_PROFILING + -mno-mmx -mno-sse cflags-$(CONFIG_ARM64) := $(subst -pg,,$(KBUILD_CFLAGS)) cflags-$(CONFIG_ARM) := $(subst -pg,,$(KBUILD_CFLAGS)) \ @@ -16,7 +16,7 @@ cflags-$(CONFIG_ARM) := $(subst -pg,,$(KBUILD_CFLAGS)) \ cflags-$(CONFIG_EFI_ARMSTUB) += -I$(srctree)/scripts/dtc/libfdt -KBUILD_CFLAGS := $(cflags-y) \ +KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ $(call cc-option,-ffreestanding) \ $(call cc-option,-fno-stack-protector) From a725004a74ab543510a8158b206c334756e9d8cf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 6 Jan 2016 11:05:27 +0000 Subject: [PATCH 0363/3220] UPSTREAM: arm64: head.S: use memset to clear BSS Currently we use an open-coded memzero to clear the BSS. As it is a trivial implementation, it is sub-optimal. Our optimised memset doesn't use the stack, is position-independent, and for the memzero case can make use of DC ZVA to clear large blocks efficiently. In __mmap_switched the MMU is on and there are no live caller-saved registers, so we can safely call an uninstrumented memset. This patch changes __mmap_switched to use memset when clearing the BSS. We use the __pi_memset alias so as to avoid any instrumentation in all kernel configurations.
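The new assembly is equivalent to the following C sketch (illustrative only; the real code stays in assembly and calls the position-independent, uninstrumented __pi_memset alias, and 'clear_bss' is a made-up name for this note):

#include <linux/string.h>

extern char __bss_start[], __bss_stop[];	/* linker-provided bounds */

static void __init clear_bss(void)
{
	/* a single memset replaces the old compare/store-xzr loop */
	memset(__bss_start, 0, __bss_stop - __bss_start);
}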
Cc: Catalin Marinas Cc: Marc Zyngier Reviewed-by: Ard Biesheuvel Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 2a803c4db615d85126c5c7afd5849a3cfde71422) Signed-off-by: Jeff Vander Stoep Change-Id: I3dc7050fe5566f2126cbea9abfa6063c8e6b029a --- arch/arm64/kernel/head.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 17ce7285bb12..917d98108b3f 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -415,14 +415,13 @@ ENDPROC(__create_page_tables) */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: - adr_l x6, __bss_start - adr_l x7, __bss_stop + // Clear BSS + adr_l x0, __bss_start + mov x1, xzr + adr_l x2, __bss_stop + sub x2, x2, x0 + bl __pi_memset -1: cmp x6, x7 - b.hs 2f - str xzr, [x6], #8 // Clear BSS - b 1b -2: adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) From 66a58725b83cdc0754f0329465f7169aa8417e3d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 15 Jan 2016 13:28:57 +0100 Subject: [PATCH 0364/3220] UPSTREAM: arm64: hide __efistub_ aliases from kallsyms Commit e8f3010f7326 ("arm64/efi: isolate EFI stub from the kernel proper") isolated the EFI stub code from the kernel proper by prefixing all of its symbols with __efistub_, and selectively allowing access to core kernel symbols from the stub by emitting __efistub_ aliases for functions and variables that the stub can access legally. As an unintended side effect, these aliases are emitted into the kallsyms symbol table, which means they may turn up in backtraces, e.g., ... PC is at __efistub_memset+0x108/0x200 LR is at fixup_init+0x3c/0x48 ... [] __efistub_memset+0x108/0x200 [] free_initmem+0x2c/0x40 [] kernel_init+0x20/0xe0 [] ret_from_fork+0x10/0x40 The backtrace in question has nothing to do with the EFI stub, but simply returns one of the several aliases of memset() that have been recorded in the kallsyms table. This is undesirable, since it may suggest to people who are not aware of this that the issue they are seeing is somehow EFI related. So hide the __efistub_ aliases from kallsyms, by emitting them as absolute linker symbols explicitly. The distinction between those and section relative symbols is completely irrelevant to these definitions, and to the final link we are performing when these definitions are being taken into account (the distinction is only relevant to symbols defined inside a section definition when performing a partial link), and so the resulting values are identical to the original ones. Since absolute symbols are ignored by kallsyms, this will result in these values to be omitted from its symbol table. After this patch, the backtrace generated from the same address looks like this: ... PC is at __memset+0x108/0x200 LR is at fixup_init+0x3c/0x48 ... 
[] __memset+0x108/0x200 [] free_initmem+0x2c/0x40 [] kernel_init+0x20/0xe0 [] ret_from_fork+0x10/0x40 Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 75feee3d9d51775072d3a04f47d4a439a4c4590e) Signed-off-by: Jeff Vander Stoep Change-Id: Ieeb8c576e31f0dd0b7f82982ffa6362864eed311 --- arch/arm64/kernel/image.h | 40 ++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index bc2abb8b1599..999633bd7294 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -64,6 +64,16 @@ #ifdef CONFIG_EFI +/* + * Prevent the symbol aliases below from being emitted into the kallsyms + * table, by forcing them to be absolute symbols (which are conveniently + * ignored by scripts/kallsyms) rather than section relative symbols. + * The distinction is only relevant for partial linking, and only for symbols + * that are defined within a section declaration (which is not the case for + * the definitions below) so the resulting values will be identical. + */ +#define KALLSYMS_HIDE(sym) ABSOLUTE(sym) + /* * The EFI stub has its own symbol namespace prefixed by __efistub_, to * isolate it from the kernel proper. The following symbols are legally @@ -73,25 +83,25 @@ * linked at. The routines below are all implemented in assembler in a * position independent manner */ -__efistub_memcmp = __pi_memcmp; -__efistub_memchr = __pi_memchr; -__efistub_memcpy = __pi_memcpy; -__efistub_memmove = __pi_memmove; -__efistub_memset = __pi_memset; -__efistub_strlen = __pi_strlen; -__efistub_strcmp = __pi_strcmp; -__efistub_strncmp = __pi_strncmp; -__efistub___flush_dcache_area = __pi___flush_dcache_area; +__efistub_memcmp = KALLSYMS_HIDE(__pi_memcmp); +__efistub_memchr = KALLSYMS_HIDE(__pi_memchr); +__efistub_memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub_memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub_memset = KALLSYMS_HIDE(__pi_memset); +__efistub_strlen = KALLSYMS_HIDE(__pi_strlen); +__efistub_strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub_strncmp = KALLSYMS_HIDE(__pi_strncmp); +__efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); #ifdef CONFIG_KASAN -__efistub___memcpy = __pi_memcpy; -__efistub___memmove = __pi_memmove; -__efistub___memset = __pi_memset; +__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub___memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub___memset = KALLSYMS_HIDE(__pi_memset); #endif -__efistub__text = _text; -__efistub__end = _end; -__efistub__edata = _edata; +__efistub__text = KALLSYMS_HIDE(_text); +__efistub__end = KALLSYMS_HIDE(_end); +__efistub__edata = KALLSYMS_HIDE(_edata); #endif From a84dda4e8627f7b6a91a5c2cd17e4cd059ac9cbf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:55:33 -0800 Subject: [PATCH 0365/3220] UPSTREAM: arch/arm/include/asm/pgtable-3level.h: add pmd_mkclean for THP MADV_FREE needs pmd_dirty and pmd_mkclean for detecting recent overwrite of the contents since MADV_FREE syscall is called for THP page. This patch adds pmd_mkclean for THP page MADV_FREE support. Signed-off-by: Minchan Kim Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: Shaohua Li Cc: Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Chen Gang Cc: Chris Zankel Cc: Daniel Micay Cc: Darrick J. Wong Cc: David S. 
Miller Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: Jason Evans Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Kirill A. Shutemov Cc: Matt Turner Cc: Max Filippov Cc: Mel Gorman Cc: Michael Kerrisk Cc: Michal Hocko Cc: Mika Penttil Cc: Ralf Baechle Cc: Richard Henderson Cc: Rik van Riel Cc: Roland Dreier Cc: Shaohua Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 44842045e4baaf406db2954dd2e07152fa61528d) Signed-off-by: Jeff Vander Stoep Change-Id: I59d53667aa8c40dea4f18fc58acc7d27f4a85a04 --- arch/arm/include/asm/pgtable-3level.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index a745a2a53853..6d6012a320b2 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF); PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING); PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY); PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY); +PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY); PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) From fd89b55b8391791a0cd49258a8922b4982d19f47 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 14:50:21 +0100 Subject: [PATCH 0366/3220] UPSTREAM: arm64: kasan: ensure that the KASAN zero page is mapped read-only When switching from the early KASAN shadow region, which maps the entire shadow space read-write, to the permanent KASAN shadow region, which uses a zero page to shadow regions that are not subject to instrumentation, the lowest level table kasan_zero_pte[] may be reused unmodified, which means that the mappings of the zero page that it contains will still be read-write. So update it explicitly to map the zero page read only when we activate the permanent mapping. Acked-by: Andrey Ryabinin Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 7b1af9795773d745c2a8c7d4ca5f2936e8b6adfb) Signed-off-by: Jeff Vander Stoep Change-Id: I4745dc91236c2612ad3e13be1cc176ffd923f7da --- arch/arm64/mm/kasan_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cf038c7d9fa9..cab7a5be40aa 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -120,6 +120,7 @@ static void __init cpu_set_ttbr1(unsigned long ttbr1) void __init kasan_init(void) { struct memblock_region *reg; + int i; /* * We are going to perform proper setup of shadow memory. @@ -155,6 +156,14 @@ void __init kasan_init(void) pfn_to_nid(virt_to_pfn(start))); } + /* + * KAsan may reuse the contents of kasan_zero_pte directly, so we + * should make sure that it maps the zero page read-only. + */ + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(&kasan_zero_pte[i], + pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); + memset(kasan_zero_page, 0, PAGE_SIZE); cpu_set_ttbr1(__pa(swapper_pg_dir)); flush_tlb_all(); From da4712caec8748615aa17eeb9d6c3cd3b0f4f910 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 24 Jan 2016 15:24:12 +0900 Subject: [PATCH 0367/3220] UPSTREAM: arm64: Fix an enum typo in mm/dump.c This patch fixes a typo in mm/dump.c: "MODUELS_END_NR" should be "MODULES_END_NR". 
Signed-off-by: Masanari Iida Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit b3122023df935cf14bf951da98ca598d71b9f826) Signed-off-by: Jeff Vander Stoep Change-Id: I5e73d18fd70f20f200a974f5f2ba22cb7ae64952 --- arch/arm64/mm/dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 5a22a119a74c..0adbebbc2803 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -46,7 +46,7 @@ enum address_markers_idx { PCI_START_NR, PCI_END_NR, MODULES_START_NR, - MODUELS_END_NR, + MODULES_END_NR, KERNEL_SPACE_NR, }; From 720345213019ec25bc43f6b78592f4c39e2c1731 Mon Sep 17 00:00:00 2001 From: William Cohen Date: Thu, 21 Jan 2016 22:56:26 -0500 Subject: [PATCH 0368/3220] BACKPORT: Eliminate the .eh_frame sections from the aarch64 vmlinux and kernel modules By default the aarch64 gcc generates .eh_frame sections. Unlike .debug_frame sections, the .eh_frame sections are loaded into memory when the associated code is loaded. On an example kernel being built with this default the .eh_frame section in vmlinux used an extra 1.7MB of memory. The x86 disables the creation of the .eh_frame section. The aarch64 should probably do the same to save some memory. Signed-off-by: William Cohen Signed-off-by: Will Deacon Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 728dabd6d1751cf5e0f8e0535891393da62396e9) Signed-off-by: Jeff Vander Stoep Change-Id: I697baae2209d6d11f2cc447459d935f7200eb7b1 --- arch/arm64/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 4e77fd165f38..693a4dc6220c 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -28,6 +28,7 @@ endif KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) KBUILD_CFLAGS += -fno-pic +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_AFLAGS += $(lseinstr) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) From 81d07628d337df8070830a82fbca2f52015e0ebd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:55 +0000 Subject: [PATCH 0369/3220] UPSTREAM: asm-generic: Fix local variable shadow in __set_fixmap_offset Currently __set_fixmap_offset is a macro function which has a local variable called 'addr'. If a caller passes a 'phys' parameter which is derived from a variable also called 'addr', the local variable will shadow this, and the compiler will complain about the use of an uninitialized variable. To avoid the issue with namespace clashes, 'addr' is prefixed with a liberal sprinkling of underscores. Turning __set_fixmap_offset into a static inline breaks the build for several architectures. Fixing this properly requires updates to a number of architectures to make them agree on the prototype of __set_fixmap (it could be done as a subsequent patch series). 
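The bug class being avoided, reduced to a self-contained example (illustrative only; BAD_DOUBLE is made up for this note). A statement-expression local that shares its name with a caller's variable captures the macro argument itself:

#define BAD_DOUBLE(x)	({ int tmp = (x); tmp + tmp; })

int tmp = 3;
int r = BAD_DOUBLE(tmp);	/* expands to 'int tmp = (tmp);', so the inner
				 * declaration shadows the caller's 'tmp', the
				 * local is initialised from itself, and 'r'
				 * reads garbage */

In __set_fixmap_offset the clash was on 'addr', producing the uninitialised-variable warning described above; the underscore prefix makes such a collision implausible.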
Signed-off-by: Mark Rutland Cc: Arnd Bergmann [catalin.marinas@arm.com: squashed the original function patch and macro fixup] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 3694bd76781b76c4f8d2ecd85018feeb1609f0e5) Signed-off-by: Jeff Vander Stoep Change-Id: Iec27cb36dfca39e333de9f7318e76da0670d0156 --- include/asm-generic/fixmap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h index 1cbb8338edf3..827e4d3bbc7a 100644 --- a/include/asm-generic/fixmap.h +++ b/include/asm-generic/fixmap.h @@ -70,12 +70,12 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) #endif /* Return a pointer with offset calculated */ -#define __set_fixmap_offset(idx, phys, flags) \ -({ \ - unsigned long addr; \ - __set_fixmap(idx, phys, flags); \ - addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \ - addr; \ +#define __set_fixmap_offset(idx, phys, flags) \ +({ \ + unsigned long ________addr; \ + __set_fixmap(idx, phys, flags); \ + ________addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \ + ________addr; \ }) #define set_fixmap_offset(idx, phys) \ From 0c8db8d0d4c615dd53b703fd632d4481e2bb18d6 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:56 +0000 Subject: [PATCH 0370/3220] UPSTREAM: arm64: mm: specialise pagetable allocators We pass a size parameter to early_alloc and late_alloc, but these are only ever used to allocate single pages. In late_alloc we always allocate a single page. Both allocators provide us with zeroed pages (such that all entries are invalid), but we have no barriers between allocating a page and adding that page to existing (live) tables. A concurrent page table walk may see stale data, leading to a number of issues. This patch specialises the two allocators for page tables. The size parameter is removed and the necessary dsb(ishst) is folded into each. To make it clear that the functions are intended for use for page table allocation, they are renamed to {early,late}_pgtable_alloc, with the related function pointer renamed to pgtable_alloc. As the dsb(ishst) is now in the allocator, the existing barrier for the zero page is redundant and thus is removed. The previously missing include of barrier.h is added.
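The hazard that the folded-in dsb(ishst) closes, sketched in C (illustrative fragment, not part of the patch):

pte_t *pte = early_pgtable_alloc();	/* page zeroed by CPU stores */
/*
 * Without the dsb(ishst) now inside the allocator, the store below could
 * become visible to the hardware table walker before the zeroing stores,
 * letting a concurrent walk read stale "entries" from the new table.
 */
__pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE);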
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 21ab99c289d350f4ae454bc069870009db6df20e) Signed-off-by: Jeff Vander Stoep Change-Id: Ib88fafc146506943122aa1ca6ee5a6c331ddb26c --- arch/arm64/mm/mmu.c | 49 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 7b6e58a58bf3..ad8c076d0aef 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -62,15 +63,18 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static void __init *early_alloc(unsigned long sz) +static void __init *early_pgtable_alloc(void) { phys_addr_t phys; void *ptr; - phys = memblock_alloc(sz, sz); + phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); BUG_ON(!phys); ptr = __va(phys); - memset(ptr, 0, sz); + memset(ptr, 0, PAGE_SIZE); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); return ptr; } @@ -95,12 +99,12 @@ static void split_pmd(pmd_t *pmd, pte_t *pte) static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - pte = alloc(PTRS_PER_PTE * sizeof(pte_t)); + pte = pgtable_alloc(); if (pmd_sect(*pmd)) split_pmd(pmd, pte); __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); @@ -130,7 +134,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pmd_t *pmd; unsigned long next; @@ -139,7 +143,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - pmd = alloc(PTRS_PER_PMD * sizeof(pmd_t)); + pmd = pgtable_alloc(); if (pud_sect(*pud)) { /* * need to have the 1G of mappings continue to be @@ -174,7 +178,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, } } else { alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot, alloc); + prot, pgtable_alloc); } phys += next - addr; } while (pmd++, addr = next, addr != end); @@ -195,13 +199,13 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pud_t *pud; unsigned long next; if (pgd_none(*pgd)) { - pud = alloc(PTRS_PER_PUD * sizeof(pud_t)); + pud = pgtable_alloc(); pgd_populate(mm, pgd, pud); } BUG_ON(pgd_bad(*pgd)); @@ -234,7 +238,8 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } } } else { - alloc_init_pmd(mm, pud, addr, next, phys, prot, alloc); + alloc_init_pmd(mm, pud, addr, next, phys, prot, + pgtable_alloc); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -247,7 +252,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { unsigned long addr, length, end, next; @@ -265,18 +270,18 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(mm, pgd, addr, next, phys, prot, alloc); + alloc_init_pud(mm, pgd, addr, next, phys, prot, pgtable_alloc); phys += next - addr; } while (pgd++, addr = next, addr != end); } -static void *late_alloc(unsigned long size) +static void *late_pgtable_alloc(void) { - void *ptr; - - BUG_ON(size > PAGE_SIZE); - ptr = (void *)__get_free_page(PGALLOC_GFP); + void *ptr = (void *)__get_free_page(PGALLOC_GFP); BUG_ON(!ptr); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); return ptr; } @@ -289,7 +294,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, return; } __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, - size, prot, early_alloc); + size, prot, early_pgtable_alloc); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -297,7 +302,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot) { __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot, - late_alloc); + late_pgtable_alloc); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -310,7 +315,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } return __create_mapping(&init_mm, pgd_offset_k(virt), - phys, virt, size, prot, late_alloc); + phys, virt, size, prot, late_pgtable_alloc); } #ifdef CONFIG_DEBUG_RODATA @@ -457,7 +462,7 @@ void __init paging_init(void) fixup_executable(); /* allocate the zero page. */ - zero_page = early_alloc(PAGE_SIZE); + zero_page = early_pgtable_alloc(); bootmem_init(); From 2563a08010f2a945a39c19853297ed980c8d469f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:57 +0000 Subject: [PATCH 0371/3220] UPSTREAM: arm64: mm: place empty_zero_page in bss Currently the zero page is set up in paging_init, and thus we cannot use the zero page earlier. 
We use the zero page as a reserved TTBR value from which no TLB entries may be allocated (e.g. when uninstalling the idmap). To enable such usage earlier (as may be required for invasive changes to the kernel page tables), and to minimise the time that the idmap is active, we need to be able to use the zero page before paging_init. This patch follows the example set by x86, by allocating the zero page at compile time, in .bss. This means that the zero page itself is available immediately upon entry to start_kernel (as we zero .bss before this), and also means that the zero page takes up no space in the raw Image binary. The associated struct page is allocated in bootmem_init, and remains unavailable until this time. Outside of arch code, the only users of empty_zero_page assume that the empty_zero_page symbol refers to the zeroed memory itself, and that ZERO_PAGE(x) must be used to acquire the associated struct page, following the example of x86. This patch also brings arm64 inline with these assumptions. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 5227cfa71f9e8574373f4d0e9e754942d76cdf67) Signed-off-by: Jeff Vander Stoep Change-Id: I43005dd8533d9968d2cef48bd2821d4507cae5ad --- arch/arm64/include/asm/mmu_context.h | 2 +- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/arm64/kernel/head.S | 1 + arch/arm64/mm/mmu.c | 9 +-------- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 24165784b803..600eacb9f7d5 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -48,7 +48,7 @@ static inline void contextidr_thread_switch(struct task_struct *next) */ static inline void cpu_set_reserved_ttbr0(void) { - unsigned long ttbr = page_to_phys(empty_zero_page); + unsigned long ttbr = virt_to_phys(empty_zero_page); asm( " msr ttbr0_el1, %0 // set TTBR0\n" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 69d2e2f86bce..e11dc90fbf56 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -121,8 +121,8 @@ extern void __pgd_error(const char *file, int line, unsigned long val); * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern struct page *empty_zero_page; -#define ZERO_PAGE(vaddr) (empty_zero_page) +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 917d98108b3f..53b9f9f128c2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -421,6 +421,7 @@ __mmap_switched: adr_l x2, __bss_stop sub x2, x2, x0 bl __pi_memset + dsb ishst // Make zero page visible to PTW adr_l sp, initial_sp, x4 mov x4, sp diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ad8c076d0aef..acf0442b9759 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -49,7 +49,7 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); * Empty_zero_page is a special page that is used for zero-initialized data * and COW. 
*/ -struct page *empty_zero_page; +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -456,18 +456,11 @@ void fixup_init(void) */ void __init paging_init(void) { - void *zero_page; - map_mem(); fixup_executable(); - /* allocate the zero page. */ - zero_page = early_pgtable_alloc(); - bootmem_init(); - empty_zero_page = virt_to_page(zero_page); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. From c66ef5f947144ca420c7483226e3ac64ad8fd813 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:58 +0000 Subject: [PATCH 0372/3220] UPSTREAM: arm64: unify idmap removal We currently open-code the removal of the idmap and restoration of the current task's MMU state in a few places. Before introducing yet more copies of this sequence, unify these to call a new helper, cpu_uninstall_idmap. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Lorenzo Pieralisi Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 9e8e865bbe294a69666a1996bda3e87825b258c0) Signed-off-by: Jeff Vander Stoep Change-Id: I6e9cb0253a1d2d63232f8fa0b3f39f8f6987b239 --- arch/arm64/include/asm/mmu_context.h | 25 +++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 1 + arch/arm64/kernel/smp.c | 4 +--- arch/arm64/kernel/suspend.c | 20 ++++---------------- arch/arm64/mm/mmu.c | 4 +--- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 600eacb9f7d5..b1b2514d8883 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -27,6 +27,7 @@ #include #include #include +#include #ifdef CONFIG_PID_IN_CONTEXTIDR static inline void contextidr_thread_switch(struct task_struct *next) @@ -89,6 +90,30 @@ static inline void cpu_set_default_tcr_t0sz(void) : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } +/* + * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. + * + * The idmap lives in the same VA range as userspace, but uses global entries + * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from + * speculative TLB fetches, we must temporarily install the reserved page + * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ. + * + * If current is a not a user task, the mm covers the TTBR1_EL1 page tables, + * which should not be installed in TTBR0_EL1. In this case we can leave the + * reserved page tables in place. 
+ */ +static inline void cpu_uninstall_idmap(void) +{ + struct mm_struct *mm = current->active_mm; + + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + cpu_set_default_tcr_t0sz(); + + if (mm != &init_mm) + cpu_switch_mm(mm->pgd, mm); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 7f0e7226795e..de8e187ec8c6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -62,6 +62,7 @@ #include #include #include +#include phys_addr_t __fdt_pointer __initdata; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b1adc51b2c2e..68e7f79630d4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -149,9 +149,7 @@ asmlinkage void secondary_start_kernel(void) * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + cpu_uninstall_idmap(); preempt_disable(); trace_hardirqs_off(); diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 1095aa483a1c..66055392f445 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -60,7 +60,6 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) */ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { - struct mm_struct *mm = current->active_mm; int ret; unsigned long flags; @@ -87,22 +86,11 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) ret = __cpu_suspend_enter(arg, fn); if (ret == 0) { /* - * We are resuming from reset with TTBR0_EL1 set to the - * idmap to enable the MMU; set the TTBR0 to the reserved - * page tables to prevent speculative TLB allocations, flush - * the local tlb and set the default tcr_el1.t0sz so that - * the TTBR0 address space set-up is properly restored. - * If the current active_mm != &init_mm we entered cpu_suspend - * with mappings in TTBR0 that must be restored, so we switch - * them back to complete the address space configuration - * restoration before returning. + * We are resuming from reset with the idmap active in TTBR0_EL1. + * We must uninstall the idmap and restore the expected MMU + * state before we can possibly return to userspace. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); - - if (mm != &init_mm) - cpu_switch_mm(mm->pgd, mm); + cpu_uninstall_idmap(); /* * Restore per-cpu offset before any kernel diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index acf0442b9759..b00462172705 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -465,9 +465,7 @@ void __init paging_init(void) * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + cpu_uninstall_idmap(); } /* From 8ac5abfecf99f0c1c065c83a0ff7481d29173e03 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:59 +0000 Subject: [PATCH 0373/3220] UPSTREAM: arm64: unmap idmap earlier During boot we leave the idmap in place until paging_init, as we previously had to wait for the zero page to become allocated and accessible. 
Now that we have a statically-allocated zero page, we can uninstall the idmap much earlier in the boot process, making it far easier to spot accidental use of physical addresses. This also brings the cold boot path in line with the secondary boot path. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 86ccce896cb0aa800a7a6dcd29b41ffc4eeb1a75) Signed-off-by: Jeff Vander Stoep Change-Id: I6375bd9855e45727790697875b7cd19f84a4dd7f --- arch/arm64/kernel/setup.c | 6 ++++++ arch/arm64/mm/mmu.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index de8e187ec8c6..079db3ba21c4 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -314,6 +314,12 @@ void __init setup_arch(char **cmdline_p) */ local_async_enable(); + /* + * TTBR0 is only used for the identity mapping at this stage. Make it + * point to zero page to avoid speculatively fetching new entries. + */ + cpu_uninstall_idmap(); + efi_init(); arm64_memblock_init(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b00462172705..c0a4160331f8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -460,12 +460,6 @@ void __init paging_init(void) fixup_executable(); bootmem_init(); - - /* - * TTBR0 is only used for the identity mapping at this stage. Make it - * point to zero page to avoid speculatively fetching new entries. - */ - cpu_uninstall_idmap(); } /* From 4ba7b986cf3bbc2e3e66f2a7805356a30f3b203c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:00 +0000 Subject: [PATCH 0374/3220] UPSTREAM: arm64: add function to install the idmap In some cases (e.g. when making invasive changes to the kernel page tables) we will need to execute code from the idmap. Add a new helper which may be used to install the idmap, complementing the existing cpu_uninstall_idmap. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 609116d202a8c5fd3fe393eb85373cbee906df68) Signed-off-by: Jeff Vander Stoep Change-Id: I1aac9e85e3d49add72c8aab7d80777a39f5fdd8e --- arch/arm64/include/asm/mmu_context.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index b1b2514d8883..944f2730a940 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -74,7 +74,7 @@ static inline bool __cpu_uses_extended_idmap(void) /* * Set TCR.T0SZ to its default value (based on VA_BITS) */ -static inline void cpu_set_default_tcr_t0sz(void) +static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { unsigned long tcr; @@ -87,9 +87,12 @@ static inline void cpu_set_default_tcr_t0sz(void) " msr tcr_el1, %0 ;" " isb" : "=&r" (tcr) - : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); + : "r"(t0sz), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } +#define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(VA_BITS)) +#define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz) + /* * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. 
* @@ -114,6 +117,15 @@ static inline void cpu_uninstall_idmap(void) cpu_switch_mm(mm->pgd, mm); } +static inline void cpu_install_idmap(void) +{ + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + cpu_set_idmap_tcr_t0sz(); + + cpu_switch_mm(idmap_pg_dir, &init_mm); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously From 214ae8984d113452991df67aa2820a28755219c3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:01 +0000 Subject: [PATCH 0375/3220] UPSTREAM: arm64: mm: add code to safely replace TTBR1_EL1 If page tables are modified without suitable TLB maintenance, the ARM architecture permits multiple TLB entries to be allocated for the same VA. When this occurs, it is permitted that TLB conflict aborts are raised in response to synchronous data/instruction accesses, and/or an amalgamation of the TLB entries may be used as a result of a TLB lookup. The presence of conflicting TLB entries may result in a variety of behaviours detrimental to the system (e.g. erroneous physical addresses may be used by I-cache fetches and/or page table walks). Some of these cases may result in unexpected changes of hardware state, and/or result in the (asynchronous) delivery of SError. To avoid these issues, we must avoid situations where conflicting entries may be allocated into TLBs. For user and module mappings we can follow a strict break-before-make approach, but this cannot work for modifications to the swapper page tables that cover the kernel text and data. Instead, this patch adds code which is intended to be executed from the idmap, which can safely unmap the swapper page tables as it only requires the idmap to be active. This enables us to uninstall the active TTBR1_EL1 entry, invalidate TLBs, then install a new TTBR1_EL1 entry without potentially unmapping code or data required for the sequence. This avoids the risk of conflict, but requires that updates are staged in a copy of the swapper page tables prior to being installed. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 50e1881ddde2a986c7d0d2150985239e5e3d7d96) Signed-off-by: Jeff Vander Stoep Change-Id: Ibd0a7c802a1abe0ba08a99819fd62ac646186005 --- arch/arm64/include/asm/mmu_context.h | 19 +++++++++++++++++++ arch/arm64/mm/proc.S | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 944f2730a940..a00f7cf35bbd 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -126,6 +126,25 @@ static inline void cpu_install_idmap(void) cpu_switch_mm(idmap_pg_dir, &init_mm); } +/* + * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, + * avoiding the possibility of conflicting TLB entries being allocated.
+ */ +static inline void cpu_replace_ttbr1(pgd_t *pgd) +{ + typedef void (ttbr_replace_func)(phys_addr_t); + extern ttbr_replace_func idmap_cpu_replace_ttbr1; + ttbr_replace_func *replace_phys; + + phys_addr_t pgd_phys = virt_to_phys(pgd); + + replace_phys = (void *)virt_to_phys(idmap_cpu_replace_ttbr1); + + cpu_install_idmap(); + replace_phys(pgd_phys); + cpu_uninstall_idmap(); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index c164d2cb35c0..0c19534a901e 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -140,6 +140,34 @@ ENTRY(cpu_do_switch_mm) ret ENDPROC(cpu_do_switch_mm) + .pushsection ".idmap.text", "ax" +/* + * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd) + * + * This is the low-level counterpart to cpu_replace_ttbr1, and should not be + * called by anything else. It can only be executed from a TTBR0 mapping. + */ +ENTRY(idmap_cpu_replace_ttbr1) + mrs x2, daif + msr daifset, #0xf + + adrp x1, empty_zero_page + msr ttbr1_el1, x1 + isb + + tlbi vmalle1 + dsb nsh + isb + + msr ttbr1_el1, x0 + isb + + msr daif, x2 + + ret +ENDPROC(idmap_cpu_replace_ttbr1) + .popsection + /* * __cpu_setup * From 0991212c3218e24c047ce32fa462931a5a53024e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:02 +0000 Subject: [PATCH 0376/3220] UPSTREAM: arm64: kasan: avoid TLB conflicts The page table modification performed during the KASAN init risks the allocation of conflicting TLB entries, as it swaps a set of valid global entries for another without suitable TLB maintenance. The presence of conflicting TLB entries can result in the delivery of synchronous TLB conflict aborts, or may result in the use of erroneous data being returned in response to a TLB lookup. This can affect explicit data accesses from software as well as translations performed asynchronously (e.g. as part of page table walks or speculative I-cache fetches), and can therefore result in a wide variety of problems. To avoid this, use cpu_replace_ttbr1 to swap the page tables. This ensures that when the new tables are installed there are no stale entries from the old tables which may conflict. As all updates are made to the tables while they are not active, the updates themselves are safe. At the same time, add the missing barrier to ensure that the tmp_pg_dir entries updated via memcpy are visible to the page table walkers at the point the tmp_pg_dir is installed. All other page table updates made as part of KASAN initialisation have the requisite barriers due to the use of the standard page table accessors. 
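Condensed, the kasan_init() sequence after this patch is (a sketch of the diff below):

memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
dsb(ishst);				/* publish the copied entries */
cpu_replace_ttbr1(tmp_pg_dir);		/* conflict-free switch via the idmap */

/* ... tear down and rebuild the shadow in swapper_pg_dir ... */

cpu_replace_ttbr1(swapper_pg_dir);	/* switch back the same safe way */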
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit c1a88e9124a499939ebd8069d5e4d3937f019157) Signed-off-by: Jeff Vander Stoep Change-Id: I0934b236e25e015d6946dcedcca14ba9512418f7 --- arch/arm64/mm/kasan_init.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cab7a5be40aa..263b59020500 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -108,15 +109,6 @@ static void __init clear_pgds(unsigned long start, set_pgd(pgd_offset_k(start), __pgd(0)); } -static void __init cpu_set_ttbr1(unsigned long ttbr1) -{ - asm( - " msr ttbr1_el1, %0\n" - " isb" - : - : "r" (ttbr1)); -} - void __init kasan_init(void) { struct memblock_region *reg; @@ -130,8 +122,8 @@ void __init kasan_init(void) * setup will be finished. */ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir)); - cpu_set_ttbr1(__pa(tmp_pg_dir)); - flush_tlb_all(); + dsb(ishst); + cpu_replace_ttbr1(tmp_pg_dir); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -165,8 +157,7 @@ void __init kasan_init(void) pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); memset(kasan_zero_page, 0, PAGE_SIZE); - cpu_set_ttbr1(__pa(swapper_pg_dir)); - flush_tlb_all(); + cpu_replace_ttbr1(swapper_pg_dir); /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; From ded0130fa93036b518b16b5af7ef7c318b67e3e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:03 +0000 Subject: [PATCH 0377/3220] UPSTREAM: arm64: mm: move pte_* macros For pmd, pud, and pgd levels of table, functions including p?d_index and p?d_offset are defined after the p?d_page_vaddr function for the immediately higher level of table. The pte functions however are defined much earlier, even though several rely on the later definition of pmd_page_vaddr. While this isn't currently a problem as these are macros, it prevents the logical grouping of later C functions (which cannot rely on prototypes for functions not yet defined). Move these definitions after pmd_page_vaddr, for consistency with the placement of these functions for other levels of table. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 053520f7d3923cc6d37afb28f9887cb1e7d77454) Signed-off-by: Jeff Vander Stoep Change-Id: I44c7327dc0f257188c0c2204ec7da0e7fdca7f64 --- arch/arm64/include/asm/pgtable.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e11dc90fbf56..1d55520c3561 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -134,16 +134,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) -/* Find an entry in the third-level page table. 
*/ -#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) - -#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) - /* * The following only work if pte_present(). Undefined behaviour otherwise. */ @@ -445,6 +435,16 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK); } +/* Find an entry in the third-level page table. */ +#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + +#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) + +#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) + #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) /* From 4d41b2a8a8412b07033ddcd56cc0b31b38889cdb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:04 +0000 Subject: [PATCH 0378/3220] UPSTREAM: arm64: mm: add functions to walk page tables by PA To allow us to walk tables allocated into the fixmap, we need to acquire the physical address of a page, rather than the virtual address in the linear map. This patch adds new p??_page_paddr and p??_offset_phys functions to acquire the physical address of a next-level table, and changes p??_offset* into macros which simply convert this to a linear map VA. This renders p??_page_vaddr unused, and hence they are removed. At the pgd level, a new pgd_offset_raw function is added to find the relevant PGD entry given the base of a PGD and a virtual address. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit dca56dca7124709f3dfca81afe61b4d98eb9cacf) Signed-off-by: Jeff Vander Stoep Change-Id: Ie43d00014322e29aec70617db3af242ed8545738 --- arch/arm64/include/asm/pgtable.h | 39 +++++++++++++++++++------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1d55520c3561..837fc7b8a9a4 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -430,15 +430,16 @@ static inline void pmd_clear(pmd_t *pmdp) set_pmd(pmdp, __pmd(0)); } -static inline pte_t *pmd_page_vaddr(pmd_t pmd) +static inline phys_addr_t pmd_page_paddr(pmd_t pmd) { - return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK); + return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the third-level page table. 
*/ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) +#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) +#define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) @@ -473,21 +474,23 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } -static inline pmd_t *pud_page_vaddr(pud_t pud) +static inline phys_addr_t pud_page_paddr(pud_t pud) { - return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK); + return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the second-level page table. */ #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) -{ - return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); -} +#define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) +#define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +#else + +#define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -509,21 +512,23 @@ static inline void pgd_clear(pgd_t *pgdp) set_pgd(pgdp, __pgd(0)); } -static inline pud_t *pgd_page_vaddr(pgd_t pgd) +static inline phys_addr_t pgd_page_paddr(pgd_t pgd) { - return __va(pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK); + return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the frst-level page table. */ #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) -{ - return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr); -} +#define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) +#define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +#else + +#define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) @@ -531,7 +536,9 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) /* to find an entry in a page-table-directory */ #define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) -#define pgd_offset(mm, addr) ((mm)->pgd+pgd_index(addr)) +#define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr)) + +#define pgd_offset(mm, addr) (pgd_offset_raw((mm)->pgd, (addr))) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) From 1845f62c4eb95f0dd288ac54bba784e0acb53171 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:05 +0000 Subject: [PATCH 0379/3220] UPSTREAM: arm64: mm: avoid redundant __pa(__va(x)) When we "upgrade" to a section mapping, we free any table we made redundant by giving it back to memblock. To get the PA, we acquire the physical address and convert this to a VA, then subsequently convert this back to a PA. This works currently, but will not work if the tables are not accessed via linear map VAs (e.g. if we use fixmap slots). This patch uses {pmd,pud}_page_paddr to acquire the PA.
This avoids the __pa(__va()) round trip, saving some work and avoiding reliance on the linear mapping. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 316b39db06718d59d82736df9fc65cf05b467cc7) Signed-off-by: Jeff Vander Stoep Change-Id: I26ed77790b5b00a15ea09a5a23a176de5b73a002 --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c0a4160331f8..fa2471ac191a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -171,7 +171,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, if (!pmd_none(old_pmd)) { flush_tlb_all(); if (pmd_table(old_pmd)) { - phys_addr_t table = __pa(pte_offset_map(&old_pmd, 0)); + phys_addr_t table = pmd_page_paddr(old_pmd); if (!WARN_ON_ONCE(slab_is_available())) memblock_free(table, PAGE_SIZE); } @@ -232,7 +232,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, if (!pud_none(old_pud)) { flush_tlb_all(); if (pud_table(old_pud)) { - phys_addr_t table = __pa(pmd_offset(&old_pud, 0)); + phys_addr_t table = pud_page_paddr(old_pud); if (!WARN_ON_ONCE(slab_is_available())) memblock_free(table, PAGE_SIZE); } From 04390d5e9950ee522e1f6f829615a14c2368a242 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:06 +0000 Subject: [PATCH 0380/3220] UPSTREAM: arm64: mm: add __{pud,pgd}_populate We currently have __pmd_populate for creating a pmd table entry given the physical address of a pte, but don't have equivalents for the pud or pgd levels of table. To enable us to manipulate tables which are mapped outside of the linear mapping (where we have a PA, but not a linear map VA), it is useful to have these functions. This patch adds __{pud,pgd}_populate. As these should not be called when the kernel uses folded {pmd,pud}s, in these cases they expand to BUILD_BUG(). So long as the appropriate checks are made on the {pud,pgd} entry prior to attempting population, these should be optimized out at compile time. 
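For illustration, a condensed sketch of the intended calling convention (not part of the diff below; pmd_phys here stands for a physical table address obtained from any allocator, and pud/mm/pmd are assumed from the caller's context):

	/* Populate a pud entry from a bare physical address: no __va() needed. */
	if (pud_none(*pud))
		__pud_populate(pud, pmd_phys, PMD_TYPE_TABLE);

	/* The old interface becomes a trivial wrapper around the new one: */
	pud_populate(mm, pud, pmd);	/* == __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE) */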
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 1e531cce68c92b46c7d29f36a72f9a3e5886678f) Signed-off-by: Jeff Vander Stoep Change-Id: I4bb3b67d0e8b63dfbd1292a93bdde42ead23a5c2 --- arch/arm64/include/asm/pgalloc.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index c15053902942..ff98585d085a 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -42,11 +42,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) { - set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE)); + set_pud(pud, __pud(pmd | prot)); } +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE); +} +#else +static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) +{ + BUILD_BUG(); +} #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -62,11 +71,20 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) { - set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE)); + set_pgd(pgdp, __pgd(pud | prot)); } +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + __pgd_populate(pgd, __pa(pud), PUD_TYPE_TABLE); +} +#else +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) +{ + BUILD_BUG(); +} #endif /* CONFIG_PGTABLE_LEVELS > 3 */ extern pgd_t *pgd_alloc(struct mm_struct *mm); From d548784c60aa9b5be69218bcbbdaf5f7a767c659 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:07 +0000 Subject: [PATCH 0381/3220] UPSTREAM: arm64: mm: add functions to walk tables in fixmap As a preparatory step to allow us to allocate early page tables from unmapped memory using memblock_alloc, add new p??_{set,clear}_fixmap* functions which can be used to walk page tables outside of the linear mapping by using fixmap slots. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 961faac114819a01e627fe9c9c82b830bb3849d4) Signed-off-by: Jeff Vander Stoep Change-Id: I58a0f54d2c637ef0fda8e0b9187ade969aecd347 --- arch/arm64/include/asm/fixmap.h | 10 ++++++++++ arch/arm64/include/asm/pgtable.h | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 309704544d22..1a617d46fce9 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,6 +62,16 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses, FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, + + /* + * Used for kernel page table creation, so unmapped memory may be used + * for tables. 
+ */ + FIX_PTE, + FIX_PMD, + FIX_PUD, + FIX_PGD, + __end_of_fixed_addresses }; diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 837fc7b8a9a4..6a58dd48663f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -57,6 +57,7 @@ #ifndef __ASSEMBLY__ +#include #include extern void __pte_error(const char *file, int line, unsigned long val); @@ -446,6 +447,10 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) +#define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) +#define pte_clear_fixmap() clear_fixmap(FIX_PTE) + #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) /* @@ -485,12 +490,21 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) #define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) +#define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr)) +#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) +#define pmd_clear_fixmap() clear_fixmap(FIX_PMD) + #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) +/* Match pmd_offset folding in */ +#define pmd_set_fixmap(addr) NULL +#define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) +#define pmd_clear_fixmap() + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -523,12 +537,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) +#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) +#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) +#define pud_clear_fixmap() clear_fixmap(FIX_PUD) + #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) #else #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) +/* Match pud_offset folding in */ +#define pud_set_fixmap(addr) NULL +#define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) +#define pud_clear_fixmap() + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) @@ -543,6 +566,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) +#define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr)) +#define pgd_clear_fixmap() clear_fixmap(FIX_PGD) + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | From 7739d1799b628e8a52402f25034c960ee2e27a9a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:08 +0000 Subject: [PATCH 0382/3220] UPSTREAM: arm64: mm: use fixmap when creating page tables As a preparatory step to allow us to allocate early page tables from unmapped memory using memblock_alloc, modify the __create_mapping callees to map and unmap the tables they modify using fixmap entries. All but the top-level pgd initialisation is performed via the fixmap. Subsequent patches will inject the pgd physical address, and migrate to using the FIX_PGD slot. 
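Condensed, the map/modify/unmap pattern applied at each table level in the diff below looks like this (variable names as in alloc_init_pte()):

	pte = pte_set_fixmap_offset(pmd, addr);		/* map the table via FIX_PTE */
	do {
		set_pte(pte, pfn_pte(pfn, prot));	/* modify entries */
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);

	pte_clear_fixmap();				/* unmap the slot again */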
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit f4710445458c0a1bd1c3c014ada2e7d7dc7b882f) Signed-off-by: Jeff Vander Stoep Change-Id: I71d2fb02d009915b5aab7f3467c4a3c658e898de --- arch/arm64/mm/mmu.c | 61 ++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fa2471ac191a..d7fa7f52f446 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -63,19 +63,30 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static void __init *early_pgtable_alloc(void) +static phys_addr_t __init early_pgtable_alloc(void) { phys_addr_t phys; void *ptr; phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); BUG_ON(!phys); - ptr = __va(phys); + + /* + * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE + * slot will be free, so we can (ab)use the FIX_PTE slot to initialise + * any level of table. + */ + ptr = pte_set_fixmap(phys); + memset(ptr, 0, PAGE_SIZE); - /* Ensure the zeroed page is visible to the page table walker */ - dsb(ishst); - return ptr; + /* + * Implicit barriers also ensure the zeroed page is visible to the page + * table walker + */ + pte_clear_fixmap(); + + return phys; } /* @@ -99,24 +110,28 @@ static void split_pmd(pmd_t *pmd, pte_t *pte) static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - pte = pgtable_alloc(); + phys_addr_t pte_phys = pgtable_alloc(); + pte = pte_set_fixmap(pte_phys); if (pmd_sect(*pmd)) split_pmd(pmd, pte); - __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); + __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); flush_tlb_all(); + pte_clear_fixmap(); } BUG_ON(pmd_bad(*pmd)); - pte = pte_offset_kernel(pmd, addr); + pte = pte_set_fixmap_offset(pmd, addr); do { set_pte(pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + + pte_clear_fixmap(); } static void split_pud(pud_t *old_pud, pmd_t *pmd) @@ -134,7 +149,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pmd_t *pmd; unsigned long next; @@ -143,7 +158,8 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - pmd = pgtable_alloc(); + phys_addr_t pmd_phys = pgtable_alloc(); + pmd = pmd_set_fixmap(pmd_phys); if (pud_sect(*pud)) { /* * need to have the 1G of mappings continue to be @@ -151,12 +167,13 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, */ split_pud(pud, pmd); } - pud_populate(mm, pud, pmd); + __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); flush_tlb_all(); + pmd_clear_fixmap(); } BUG_ON(pud_bad(*pud)); - pmd = pmd_offset(pud, addr); + pmd = pmd_set_fixmap_offset(pud, addr); do { next = pmd_addr_end(addr, end); /* try section mapping first */ @@ -182,6 +199,8 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, } phys += next - addr; } while (pmd++, addr = next, addr != end); + + pmd_clear_fixmap(); } static inline bool use_1G_block(unsigned long addr, unsigned long next, @@ -199,18 +218,18 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pud_t *pud; unsigned long next; if (pgd_none(*pgd)) { - pud = pgtable_alloc(); - pgd_populate(mm, pgd, pud); + phys_addr_t pud_phys = pgtable_alloc(); + __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE); } BUG_ON(pgd_bad(*pgd)); - pud = pud_offset(pgd, addr); + pud = pud_set_fixmap_offset(pgd, addr); do { next = pud_addr_end(addr, end); @@ -243,6 +262,8 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } phys += next - addr; } while (pud++, addr = next, addr != end); + + pud_clear_fixmap(); } /* @@ -252,7 +273,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { unsigned long addr, length, end, next; @@ -275,14 +296,14 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, } while (pgd++, addr = next, addr != end); } -static void *late_pgtable_alloc(void) +static phys_addr_t late_pgtable_alloc(void) { void *ptr = (void *)__get_free_page(PGALLOC_GFP); BUG_ON(!ptr); /* Ensure the zeroed page is visible to the page table walker */ dsb(ishst); - return ptr; + return __pa(ptr); } static void __init create_mapping(phys_addr_t phys, unsigned long virt, From caf0cdb7ae7d5b5a4da584ff37228c3f26a55aa0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:09 +0000 Subject: [PATCH 0383/3220] UPSTREAM: arm64: mm: allocate pagetables anywhere Now that create_mapping uses fixmap slots to modify pte, pmd, and pud entries, we can access page tables anywhere in physical memory, regardless of the extent of the linear mapping. Given that, we no longer need to limit memblock allocations during page table creation, and can leave the limit as its default MEMBLOCK_ALLOC_ANYWHERE. We never add memory which will fall outside of the linear map range given phys_offset and MAX_MEMBLOCK_ADDR are configured appropriately, so any tables we create will fall in the linear map of the final tables. 
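For reference, the constraint being dropped amounted to bracketing table creation as follows (a simplification of the hunks removed below):

	/* Old scheme: fence allocations into the initially-mapped region... */
	memblock_set_current_limit(PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE);
	/* ... map all memblock regions, allocating tables as needed ... */
	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);	/* then relax */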
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit cdef5f6e9e0e5ee397759b664a9f875ff59ccf01) Signed-off-by: Jeff Vander Stoep Change-Id: I9b6487491f47351303a147147c3d274da20def59 --- arch/arm64/mm/mmu.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d7fa7f52f446..e41f30b35928 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -383,20 +383,6 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end) static void __init map_mem(void) { struct memblock_region *reg; - phys_addr_t limit; - - /* - * Temporarily limit the memblock range. We need to do this as - * create_mapping requires puds, pmds and ptes to be allocated from - * memory addressable from the initial direct kernel mapping. - * - * The initial direct kernel mapping, located at swapper_pg_dir, gives - * us PUD_SIZE (with SECTION maps) or PMD_SIZE (without SECTION maps, - * memory starting from PHYS_OFFSET (which must be aligned to 2MB as - * per Documentation/arm64/booting.txt). - */ - limit = PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE; - memblock_set_current_limit(limit); /* map all the memory banks */ for_each_memblock(memory, reg) { @@ -406,29 +392,8 @@ static void __init map_mem(void) if (start >= end) break; - if (ARM64_SWAPPER_USES_SECTION_MAPS) { - /* - * For the first memory bank align the start address and - * current memblock limit to prevent create_mapping() from - * allocating pte page tables from unmapped memory. With - * the section maps, if the first block doesn't end on section - * size boundary, create_mapping() will try to allocate a pte - * page, which may be returned from an unmapped area. - * When section maps are not used, the pte page table for the - * current limit is already present in swapper_pg_dir. - */ - if (start < limit) - start = ALIGN(start, SECTION_SIZE); - if (end < limit) { - limit = end & SECTION_MASK; - memblock_set_current_limit(limit); - } - } __map_memblock(start, end); } - - /* Limit no longer required. */ - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } static void __init fixup_executable(void) From 880bee2ed6ea6051d1c387a8eb2cd81140b2ec64 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:10 +0000 Subject: [PATCH 0384/3220] UPSTREAM: arm64: mm: allow passing a pgdir to alloc_init_* To allow us to initialise pgdirs which are fixmapped, allow explicitly passing a pgdir rather than an mm. A new __create_pgd_mapping function is added for this, with existing __create_mapping callers migrated to this. The mm argument was previously only used at the top level. Now that it is redundant at all levels, it is removed. To indicate its new found similarity to alloc_init_{pud,pmd,pte}, __create_mapping is renamed to init_pgd. 
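The resulting call shape, as used by create_mapping() in the diff below, is:

	/* Callers pass a pgdir directly, so a fixmapped pgd works too: */
	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
			     early_pgtable_alloc);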
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 11509a306bb6ea595878b2d246d2d56b1783e040) Signed-off-by: Jeff Vander Stoep Change-Id: I554f90b986a0fce6c96c0a0a9da1d6e61602d0a9 --- arch/arm64/mm/mmu.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e41f30b35928..f949fe238643 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -146,8 +146,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) } while (pmd++, i++, i < PTRS_PER_PMD); } -static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, +static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -215,8 +214,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, return true; } -static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, +static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -257,7 +255,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } } } else { - alloc_init_pmd(mm, pud, addr, next, phys, prot, + alloc_init_pmd(pud, addr, next, phys, prot, pgtable_alloc); } phys += next - addr; @@ -270,8 +268,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, * Create the page directory entries and any necessary page tables for the * mapping specified by 'md'. 
*/ -static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, - phys_addr_t phys, unsigned long virt, +static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -291,7 +288,7 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(mm, pgd, addr, next, phys, prot, pgtable_alloc); + alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -306,6 +303,14 @@ static phys_addr_t late_pgtable_alloc(void) return __pa(ptr); } +static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*alloc)(void)) +{ + init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc); +} + static void __init create_mapping(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { @@ -314,16 +319,16 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, - size, prot, early_pgtable_alloc); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, + early_pgtable_alloc); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { - __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot, - late_pgtable_alloc); + __create_pgd_mapping(mm->pgd, phys, virt, size, prot, + late_pgtable_alloc); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -335,8 +340,8 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, return; } - return __create_mapping(&init_mm, pgd_offset_k(virt), - phys, virt, size, prot, late_pgtable_alloc); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, + late_pgtable_alloc); } #ifdef CONFIG_DEBUG_RODATA From 9f31360553319ee4e8fc3a1c97ef4788c024d6de Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:11 +0000 Subject: [PATCH 0385/3220] BACKPORT: arm64: ensure _stext and _etext are page-aligned Currently we have separate ALIGN_DEBUG_RO{,_MIN} directives to align _etext and __init_begin. While we ensure that __init_begin is page-aligned, we do not provide the same guarantee for _etext. This is not problematic currently as the alignment of __init_begin is sufficient to prevent issues when we modify permissions. Subsequent patches will assume page alignment of segments of the kernel we wish to map with different permissions. To ensure this, move _etext after the ALIGN_DEBUG_RO_MIN for the init section. This renders the prior ALIGN_DEBUG_RO irrelevant, and hence it is removed. Likewise, upgrade to ALIGN_DEBUG_RO_MIN(PAGE_SIZE) for _stext. 
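The next patch in the series relies on this guarantee when mapping kernel segments, asserting along the lines of:

	BUG_ON(!PAGE_ALIGNED(pa_start));	/* e.g. __pa(_stext) */
	BUG_ON(!PAGE_ALIGNED(size));		/* e.g. _etext - _stext */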
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit fca082bfb543ccaaff864fc0892379ccaa1711cd) Signed-off-by: Jeff Vander Stoep Change-Id: If1a829aa5c363f2c76eb1a18209c3c00ee3ffd0a --- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1a056bb26633..763e5aab5752 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -95,7 +95,7 @@ SECTIONS _text = .; HEAD_TEXT } - ALIGN_DEBUG_RO + ALIGN_DEBUG_RO_MIN(PAGE_SIZE) .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ __exception_text_start = .; @@ -118,9 +118,9 @@ SECTIONS RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES - ALIGN_DEBUG_RO ALIGN_DEBUG_RO_MIN(PAGE_SIZE) + _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) From 63048c3fad6dd5219008c7d842cf1a5e4bac5508 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:12 +0000 Subject: [PATCH 0386/3220] BACKPORT: arm64: mm: create new fine-grained mappings at boot At boot we may change the granularity of the tables mapping the kernel (by splitting or making sections). This may happen when we create the linear mapping (in __map_memblock), or at any point we try to apply fine-grained permissions to the kernel (e.g. fixup_executable, mark_rodata_ro, fixup_init). Changing the active page tables in this manner may result in multiple entries for the same address being allocated into TLBs, risking problems such as TLB conflict aborts or issues derived from the amalgamation of TLB entries. Generally, a break-before-make (BBM) approach is necessary to avoid conflicts, but we cannot do this for the kernel tables as it risks unmapping text or data being used to do so. Instead, we can create a new set of tables from scratch in the safety of the existing mappings, and subsequently migrate over to these using the new cpu_replace_ttbr1 helper, which avoids the two sets of tables being active simultaneously. To avoid issues when we later modify permissions of the page tables (e.g. in fixup_init), we must create the page tables at a granularity such that later modification does not result in splitting of tables. This patch applies this strategy, creating a new set of fine-grained page tables from scratch, and safely migrating to them. The existing fixmap and kasan shadow page tables are reused in the new fine-grained tables. 
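Condensed, the migration sequence implemented in paging_init() below is:

	phys_addr_t pgd_phys = early_pgtable_alloc();
	pgd_t *pgd = pgd_set_fixmap(pgd_phys);

	map_kernel(pgd);			/* fine-grained kernel mappings */
	map_mem(pgd);				/* linear mapping */

	cpu_replace_ttbr1(__va(pgd_phys));	/* switch to the new tables */
	memcpy(swapper_pg_dir, pgd, PAGE_SIZE);	/* copy pgd back into swapper */
	cpu_replace_ttbr1(swapper_pg_dir);	/* switch again, reusing swapper */

	pgd_clear_fixmap();
	memblock_free(pgd_phys, PAGE_SIZE);	/* temporary pgd no longer needed */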
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: rework-pagetable (cherry picked from commit 068a17a5805dfbca4bbf03e664ca6b19709cc7a8) Signed-off-by: Jeff Vander Stoep Change-Id: I7ddaa296b94106846ebf8392d91186df8673df64 --- arch/arm64/include/asm/kasan.h | 3 + arch/arm64/mm/kasan_init.c | 15 ++++ arch/arm64/mm/mmu.c | 154 ++++++++++++++++++++------------- 3 files changed, 110 insertions(+), 62 deletions(-) diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index 2774fa384c47..de0d21211c34 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -7,6 +7,7 @@ #include #include +#include /* * KASAN_SHADOW_START: beginning of the kernel virtual addresses. @@ -28,10 +29,12 @@ #define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << (64 - 3))) void kasan_init(void); +void kasan_copy_shadow(pgd_t *pgdir); asmlinkage void kasan_early_init(void); #else static inline void kasan_init(void) { } +static inline void kasan_copy_shadow(pgd_t *pgdir) { } #endif #endif diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 263b59020500..cc569a38bc76 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -97,6 +97,21 @@ asmlinkage void __init kasan_early_init(void) kasan_map_early_shadow(); } +/* + * Copy the current shadow region into a new pgdir. + */ +void __init kasan_copy_shadow(pgd_t *pgdir) +{ + pgd_t *pgd, *pgd_new, *pgd_end; + + pgd = pgd_offset_k(KASAN_SHADOW_START); + pgd_end = pgd_offset_k(KASAN_SHADOW_END); + pgd_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START); + do { + set_pgd(pgd_new, *pgd); + } while (pgd++, pgd_new++, pgd != pgd_end); +} + static void __init clear_pgds(unsigned long start, unsigned long end) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f949fe238643..6f5770a1b140 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -344,48 +345,42 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, late_pgtable_alloc); } -#ifdef CONFIG_DEBUG_RODATA -static void __init __map_memblock(phys_addr_t start, phys_addr_t end) +static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { + + unsigned long kernel_start = __pa(_stext); + unsigned long kernel_end = __pa(_end); + /* - * Set up the executable regions using the existing section mappings - * for now. This will get more fine grained later once all memory - * is mapped + * The kernel itself is mapped at page granularity. Map all other + * memory, making sure we don't overwrite the existing kernel mappings. 
*/ - unsigned long kernel_x_start = round_down(__pa(_stext), SWAPPER_BLOCK_SIZE); - unsigned long kernel_x_end = round_up(__pa(__init_end), SWAPPER_BLOCK_SIZE); - if (end < kernel_x_start) { - create_mapping(start, __phys_to_virt(start), - end - start, PAGE_KERNEL); - } else if (start >= kernel_x_end) { - create_mapping(start, __phys_to_virt(start), - end - start, PAGE_KERNEL); - } else { - if (start < kernel_x_start) - create_mapping(start, __phys_to_virt(start), - kernel_x_start - start, - PAGE_KERNEL); - create_mapping(kernel_x_start, - __phys_to_virt(kernel_x_start), - kernel_x_end - kernel_x_start, - PAGE_KERNEL_EXEC); - if (kernel_x_end < end) - create_mapping(kernel_x_end, - __phys_to_virt(kernel_x_end), - end - kernel_x_end, - PAGE_KERNEL); + /* No overlap with the kernel. */ + if (end < kernel_start || start >= kernel_end) { + __create_pgd_mapping(pgd, start, __phys_to_virt(start), + end - start, PAGE_KERNEL, + early_pgtable_alloc); + return; } -} -#else -static void __init __map_memblock(phys_addr_t start, phys_addr_t end) -{ - create_mapping(start, __phys_to_virt(start), end - start, - PAGE_KERNEL_EXEC); -} -#endif -static void __init map_mem(void) + /* + * This block overlaps the kernel mapping. Map the portion(s) which + * don't overlap. + */ + if (start < kernel_start) + __create_pgd_mapping(pgd, start, + __phys_to_virt(start), + kernel_start - start, PAGE_KERNEL, + early_pgtable_alloc); + if (kernel_end < end) + __create_pgd_mapping(pgd, kernel_end, + __phys_to_virt(kernel_end), + end - kernel_end, PAGE_KERNEL, + early_pgtable_alloc); +} + +static void __init map_mem(pgd_t *pgd) { struct memblock_region *reg; @@ -397,33 +392,10 @@ static void __init map_mem(void) if (start >= end) break; - __map_memblock(start, end); + __map_memblock(pgd, start, end); } } -static void __init fixup_executable(void) -{ -#ifdef CONFIG_DEBUG_RODATA - /* now that we are actually fully mapped, make the start/end more fine grained */ - if (!IS_ALIGNED((unsigned long)_stext, SWAPPER_BLOCK_SIZE)) { - unsigned long aligned_start = round_down(__pa(_stext), - SWAPPER_BLOCK_SIZE); - - create_mapping(aligned_start, __phys_to_virt(aligned_start), - __pa(_stext) - aligned_start, - PAGE_KERNEL); - } - - if (!IS_ALIGNED((unsigned long)__init_end, SWAPPER_BLOCK_SIZE)) { - unsigned long aligned_end = round_up(__pa(__init_end), - SWAPPER_BLOCK_SIZE); - create_mapping(__pa(__init_end), (unsigned long)__init_end, - aligned_end - __pa(__init_end), - PAGE_KERNEL); - } -#endif -} - #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { @@ -441,14 +413,72 @@ void fixup_init(void) PAGE_KERNEL); } +static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, + pgprot_t prot) +{ + phys_addr_t pa_start = __pa(va_start); + unsigned long size = va_end - va_start; + + BUG_ON(!PAGE_ALIGNED(pa_start)); + BUG_ON(!PAGE_ALIGNED(size)); + + __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, + early_pgtable_alloc); +} + +/* + * Create fine-grained mappings for the kernel. + */ +static void __init map_kernel(pgd_t *pgd) +{ + + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); + + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't live + * in the carveout for the swapper_pg_dir. We can simply re-use the + * existing dir for the fixmap. 
+ */ + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); + + kasan_copy_shadow(pgd); +} + /* * paging_init() sets up the page tables, initialises the zone memory * maps and sets up the zero page. */ void __init paging_init(void) { - map_mem(); - fixup_executable(); + phys_addr_t pgd_phys = early_pgtable_alloc(); + pgd_t *pgd = pgd_set_fixmap(pgd_phys); + + map_kernel(pgd); + map_mem(pgd); + + /* + * We want to reuse the original swapper_pg_dir so we don't have to + * communicate the new address to non-coherent secondaries in + * secondary_entry, and so cpu_switch_mm can generate the address with + * adrp+add rather than a load from some global variable. + * + * To do this we need to go via a temporary pgd. + */ + cpu_replace_ttbr1(__va(pgd_phys)); + memcpy(swapper_pg_dir, pgd, PAGE_SIZE); + cpu_replace_ttbr1(swapper_pg_dir); + + pgd_clear_fixmap(); + memblock_free(pgd_phys, PAGE_SIZE); + + /* + * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd + * allocated with it. + */ + memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE, + SWAPPER_DIR_SIZE - PAGE_SIZE); bootmem_init(); } From 17e35cbf1863e5cf74425594056ef28b18f762bf Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Tue, 23 Aug 2016 18:22:33 -0400 Subject: [PATCH 0387/3220] UPSTREAM: tun: fix transmit timestamp support Instead of using sock_tx_timestamp, use skb_tx_timestamp to record software transmit timestamp of a packet. sock_tx_timestamp resets and overrides the tx_flags of the skb. The function is intended to be called from within the protocol layer when creating the skb, not from a device driver. This is inconsistent with other drivers and will cause issues for TCP. In TCP, we intend to sample the timestamps for the last byte for each sendmsg/sendpage. For that reason, tcp_sendmsg calls tcp_tx_timestamp only with the last skb that it generates. For example, if a 128KB message is split into two 64KB packets we want to sample the SND timestamp of the last packet. The current code in the tun driver, however, will result in sampling the SND timestamp for both packets. Also, when the last packet is split into smaller packets for retranmission (see tcp_fragment), the tun driver will record timestamps for all of the retransmitted packets and not only the last packet. Change-Id: If7458ab31de52aa15a12364b6c1ac2a8f93f17a7 Fixes: eda297729171 (tun: Support software transmit time stamping.) Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Francis Yan Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2228a539184a..7b4c65effe5e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -858,10 +858,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; - if (skb->sk && sk_fullsock(skb->sk)) { - sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); - sw_tx_timestamp(skb); - } + skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. From fa42b29f530e15078b4e3070fd62e310211ee3de Mon Sep 17 00:00:00 2001 From: Eric Ernst Date: Fri, 2 Sep 2016 16:12:06 -0700 Subject: [PATCH 0388/3220] input: keyreset: switch to orderly_reboot Prior restart function would make a call to sys_sync and then execute a kernel reset. 
Rather than call the sync directly, thus necessitating this driver to be builtin, call orderly_reboot, which will take care of the file system sync. Note: since CONFIG_INPUT Kconfig is tristate, this driver can be built as module, despite being marked bool. Signed-off-by: Eric Ernst --- drivers/input/keyreset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/input/keyreset.c b/drivers/input/keyreset.c index 7fbf7247e65f..7e5222aec7c1 100644 --- a/drivers/input/keyreset.c +++ b/drivers/input/keyreset.c @@ -32,8 +32,7 @@ struct keyreset_state { static void do_restart(struct work_struct *unused) { - sys_sync(); - kernel_restart(NULL); + orderly_reboot(); } static void do_reset_fn(void *priv) From 5449876ad748ec9eae64267f3d7aab93c82a464d Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Mon, 19 Sep 2016 14:37:34 -0700 Subject: [PATCH 0389/3220] eas/sched/fair: Fixing comments in find_best_target. Change-Id: I83f5b9887e98f9fdb81318cde45408e7ebfc4b13 Signed-off-by: Srinath Sridharan --- kernel/sched/fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2cd3e2671f16..7e42cd11e430 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5674,17 +5674,19 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if(prefer_idle) { - // Find a target cpu with lowest - // utilization. + if (prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ if (target_util == 0 || target_util < new_util) { target_cpu = i; target_util = new_util; } } else { - // Find a target cpu with highest - // utilization. + /* Find a target cpu with lowest + * utilization. + */ if (target_util == 0 || target_util > new_util) { target_cpu = i; From b7c491d2c40fde253ec06a63b3f93ea60c0467a9 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Sun, 18 Sep 2016 21:39:28 -0700 Subject: [PATCH 0390/3220] ANDROID: fiq_debugger: Pass task parameter to unwind_frame() Fixes: fe13f95b7200 ("arm64: pass a task parameter to unwind_frame()") Bug: 30369029 Patchset: rework-pagetable Change-Id: I9a4ab50ef61532d27282f189f063c938c196ec08 Signed-off-by: Jeff Vander Stoep --- drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c index 99c6584fcfa5..97246bcbcd62 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c @@ -197,6 +197,6 @@ void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output, frame.sp = regs->sp; frame.pc = regs->pc; output->printf(output, "\n"); - walk_stackframe(&frame, report_trace, &sts); + walk_stackframe(current, &frame, report_trace, &sts); } } From cf43809d7aa07868ffacd83892745c85f91c8a53 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 20 Sep 2016 17:00:47 +0100 Subject: [PATCH 0391/3220] sched/walt: Drop arch-specific timer access On at least one platform, occasionally the timer providing the wallclock was able to be reset/go backwards for at least some time after wakeup. Accept that this might happen and warn the first time, but otherwise just carry on. 
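That is, the window-start handling below becomes:

	delta = wallclock - rq->window_start;
	if (delta < 0) {
		delta = 0;
		WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
	}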
Change-Id: Id3164477ba79049561af7f0889cbeebc199ead4e Signed-off-by: Chris Redpath --- kernel/sched/walt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 07b7f84b37e2..2ffb1680b380 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "sched.h" #include "walt.h" @@ -188,10 +187,8 @@ update_window_start(struct rq *rq, u64 wallclock) delta = wallclock - rq->window_start; /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ if (delta < 0) { - if (arch_timer_read_counter() == 0) - delta = 0; - else - BUG_ON(1); + delta = 0; + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n"); } if (delta < walt_ravg_window) From 32cbbe59538d2dd0b77822cc27ce32ca487c467d Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Mon, 19 Sep 2016 17:33:50 -0700 Subject: [PATCH 0392/3220] ANDROID: fs: FS tracepoints to track IO. Adds tracepoints in ext4/f2fs/mpage to track readpages/buffered write()s. This allows us to track files that are being read/written to PIDs. Change-Id: I26bd36f933108927d6903da04d8cb42fd9c3ef3d Signed-off-by: Mohan Srinivasan --- fs/ext4/inline.c | 6 ++ fs/ext4/inode.c | 27 ++++++++ fs/ext4/readpage.c | 41 +++++++++-- fs/f2fs/data.c | 21 ++++++ fs/f2fs/inline.c | 11 +++ fs/mpage.c | 30 ++++++++ include/trace/events/android_fs.h | 31 +++++++++ include/trace/events/android_fs_template.h | 79 ++++++++++++++++++++++ 8 files changed, 242 insertions(+), 4 deletions(-) create mode 100644 include/trace/events/android_fs.h create mode 100644 include/trace/events/android_fs_template.h diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..af34979684a4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -18,6 +18,7 @@ #include "ext4.h" #include "xattr.h" #include "truncate.h" +#include #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -500,6 +501,9 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) return -EAGAIN; } + trace_android_fs_dataread_start(inode, page_offset(page), PAGE_SIZE, + current->pid, current->comm); + /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. 
@@ -511,6 +515,8 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) SetPageUptodate(page); } + trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE); + up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea433a7f4bca..a7208a662ee6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -44,6 +44,7 @@ #include "truncate.h" #include +#include #define MPAGE_DA_EXTENT_TAIL 0x01 @@ -982,6 +983,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case @@ -1118,6 +1121,7 @@ static int ext4_write_end(struct file *file, int ret = 0, ret2; int i_size_changed = 0; + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_write_end(inode, pos, len, copied); if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { ret = ext4_jbd2_file_inode(handle, inode); @@ -1226,6 +1230,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; int size_changed = 0; + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -2670,6 +2675,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_ext4_da_write_begin(inode, pos, len, flags); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -2788,6 +2795,7 @@ static int ext4_da_write_end(struct file *file, return ext4_write_end(file, mapping, pos, len, copied, page, fsdata); + trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -3276,12 +3284,31 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (ext4_has_inline_data(inode)) return 0; + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_start(inode, offset, count, + current->pid, + current->comm); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, + current->comm); + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(iocb, iter, offset); else ret = ext4_ind_direct_IO(iocb, iter, offset); trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); + + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_end(inode, offset, count); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_end(inode, offset, count); + return ret; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5dc5e95063de..1ce24a6759a0 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -45,6 +45,7 @@ #include #include "ext4.h" +#include /* * Call ext4_decrypt on every single page, reusing the encryption @@ -86,6 +87,17 @@ static inline bool ext4_bio_encrypted(struct bio *bio) #endif } +static void +ext4_trace_read_completion(struct bio *bio) +{ + struct page *first_page = 
bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) + trace_android_fs_dataread_end(first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size); +} + /* * I/O completion handler for multipage BIOs. * @@ -103,6 +115,9 @@ static void mpage_end_io(struct bio *bio) struct bio_vec *bv; int i; + if (trace_android_fs_dataread_start_enabled()) + ext4_trace_read_completion(bio); + if (ext4_bio_encrypted(bio)) { struct ext4_crypto_ctx *ctx = bio->bi_private; @@ -130,6 +145,24 @@ static void mpage_end_io(struct bio *bio) bio_put(bio); } +static void +ext4_submit_bio_read(struct bio *bio) +{ + if (trace_android_fs_dataread_start_enabled()) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) { + trace_android_fs_dataread_start( + first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size, + current->pid, + current->comm); + } + } + submit_bio(READ, bio); +} + int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages) @@ -271,7 +304,7 @@ int ext4_mpage_readpages(struct address_space *mapping, */ if (bio && (last_block_in_bio != blocks[0] - 1)) { submit_and_realloc: - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } if (bio == NULL) { @@ -303,14 +336,14 @@ int ext4_mpage_readpages(struct address_space *mapping, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; goto next_page; confused: if (bio) { - submit_bio(READ, bio); + ext4_submit_bio_read(bio); bio = NULL; } if (!PageUptodate(page)) @@ -323,6 +356,6 @@ int ext4_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + ext4_submit_bio_read(bio); return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 972eab7ac071..e692958d6e78 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -26,6 +26,7 @@ #include "segment.h" #include "trace.h" #include +#include static void f2fs_read_end_io(struct bio *bio) { @@ -1401,6 +1402,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct dnode_of_data dn; int err = 0; + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, current->comm); trace_f2fs_write_begin(inode, pos, len, flags); f2fs_balance_fs(sbi); @@ -1529,6 +1532,7 @@ static int f2fs_write_end(struct file *file, { struct inode *inode = page->mapping->host; + trace_android_fs_datawrite_end(inode, pos, len); trace_f2fs_write_end(inode, pos, len, copied); set_page_dirty(page); @@ -1582,6 +1586,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + trace_android_fs_dataread_start(inode, offset, + count, current->pid, + current->comm); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, current->comm); + if (iov_iter_rw(iter) == WRITE) { __allocate_data_blocks(inode, offset, count); if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -1595,6 +1609,13 @@ out: if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); + if (trace_android_fs_dataread_start_enabled() && + (iov_iter_rw(iter) == READ)) + 
trace_android_fs_dataread_end(inode, offset, count); + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) + trace_android_fs_datawrite_end(inode, offset, count); + trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); return err; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda7126466c0..d2c5d69ba0b1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -13,6 +13,7 @@ #include "f2fs.h" #include "node.h" +#include bool f2fs_may_inline_data(struct inode *inode) { @@ -84,14 +85,22 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; + trace_android_fs_dataread_start(inode, page_offset(page), + PAGE_SIZE, current->pid, + current->comm); + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); unlock_page(page); return PTR_ERR(ipage); } if (!f2fs_has_inline_data(inode)) { f2fs_put_page(ipage, 1); + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); return -EAGAIN; } @@ -102,6 +111,8 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) SetPageUptodate(page); f2fs_put_page(ipage, 1); + trace_android_fs_dataread_end(inode, page_offset(page), + PAGE_SIZE); unlock_page(page); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index 1480d3a18037..5c65d8942692 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -30,6 +30,14 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start); +EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end); +EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start); +EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end); + /* * I/O completion handler for multipage BIOs. 
* @@ -47,6 +55,16 @@ static void mpage_end_io(struct bio *bio) struct bio_vec *bv; int i; + if (trace_android_fs_dataread_end_enabled() && + (bio_data_dir(bio) == READ)) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) + trace_android_fs_dataread_end(first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size); + } + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; page_endio(page, bio_data_dir(bio), bio->bi_error); @@ -57,6 +75,18 @@ static void mpage_end_io(struct bio *bio) static struct bio *mpage_bio_submit(int rw, struct bio *bio) { + if (trace_android_fs_dataread_start_enabled() && (rw == READ)) { + struct page *first_page = bio->bi_io_vec[0].bv_page; + + if (first_page != NULL) { + trace_android_fs_dataread_start( + first_page->mapping->host, + page_offset(first_page), + bio->bi_iter.bi_size, + current->pid, + current->comm); + } + } bio->bi_end_io = mpage_end_io; guard_bio_eod(rw, bio); submit_bio(rw, bio); diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h new file mode 100644 index 000000000000..531da433a7bc --- /dev/null +++ b/include/trace/events/android_fs.h @@ -0,0 +1,31 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM android_fs + +#if !defined(_TRACE_ANDROID_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ANDROID_FS_H + +#include +#include + +DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command)); + +DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes)); + +DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command)); + +DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes)); + +#endif /* _TRACE_ANDROID_FS_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h new file mode 100644 index 000000000000..618988b047c1 --- /dev/null +++ b/include/trace/events/android_fs_template.h @@ -0,0 +1,79 @@ +#if !defined(_TRACE_ANDROID_FS_TEMPLATE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ANDROID_FS_TEMPLATE_H + +#include + +DECLARE_EVENT_CLASS(android_fs_data_start_template, + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *command), + TP_ARGS(inode, offset, bytes, pid, command), + TP_STRUCT__entry( + __array(char, path, MAX_FILTER_STR_VAL); + __field(char *, pathname); + __field(loff_t, offset); + __field(int, bytes); + __field(loff_t, i_size); + __string(cmdline, command); + __field(pid_t, pid); + __field(ino_t, ino); + ), + TP_fast_assign( + { + struct dentry *d; + + /* + * Grab a reference to the inode here because + * d_obtain_alias() will either drop the inode + * reference if it locates an existing dentry + * or transfer the reference to the new dentry + * created. In our case, the file is still open, + * so the dentry is guaranteed to exist (connected), + * so d_obtain_alias() drops the reference we + * grabbed here. 
+ */ + ihold(inode); + d = d_obtain_alias(inode); + if (!IS_ERR(d)) { + __entry->pathname = dentry_path(d, + __entry->path, + MAX_FILTER_STR_VAL); + dput(d); + } else + __entry->pathname = ERR_PTR(-EINVAL); + __entry->offset = offset; + __entry->bytes = bytes; + __entry->i_size = i_size_read(inode); + __assign_str(cmdline, command); + __entry->pid = pid; + __entry->ino = inode->i_ino; + } + ), + TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," + " pid %d, i_size %llu, ino %lu", + (IS_ERR(__entry->pathname) ? "ERROR" : __entry->pathname), + __entry->offset, __entry->bytes, __get_str(cmdline), + __entry->pid, __entry->i_size, + (unsigned long) __entry->ino) +); + +DECLARE_EVENT_CLASS(android_fs_data_end_template, + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + TP_ARGS(inode, offset, bytes), + TP_STRUCT__entry( + __field(ino_t, ino); + __field(loff_t, offset); + __field(int, bytes); + ), + TP_fast_assign( + { + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->bytes = bytes; + } + ), + TP_printk("ino %lu, offset %llu, bytes %d", + (unsigned long) __entry->ino, + __entry->offset, __entry->bytes) +); + +#endif /* _TRACE_ANDROID_FS_TEMPLATE_H */ From df232437710122fcb4e4a0484a1eded5aec29a6a Mon Sep 17 00:00:00 2001 From: Caesar Wang Date: Tue, 23 Aug 2016 11:47:02 +0100 Subject: [PATCH 0393/3220] sched/fair: remove printk while schedule is in progress Calling printk while schedule is in progress can deadlock and spin in while(1) forever. The blocked state looks like this: cpu0 (holds the console sem): printk->console_unlock->up_sem->spin_lock(&sem->lock)->wake_up_process(cpu1) ->try_to_wake_up(cpu1)->while(p->on_cpu). cpu1 (requests the console sem): console_lock->down_sem->schedule->idle_balance->update_cpu_capacity-> printk->console_trylock->spin_lock(&sem->lock). p->on_cpu stays 1 forever because the task is still running on cpu1, so cpu0 is blocked in while(p->on_cpu); but cpu1 can never take spin_lock(&sem->lock), so it is blocked too, and the task keeps running on cpu1 forever. Signed-off-by: Caesar Wang --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e42cd11e430..6b881cf81d79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7089,7 +7089,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) mcc->cpu = cpu; #ifdef CONFIG_SCHED_DEBUG raw_spin_unlock_irqrestore(&mcc->lock, flags); - pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); goto skip_unlock; #endif } From 53c63d3b4f06e31269d66173d327164f273e4233 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 20 Sep 2016 18:42:22 -0700 Subject: [PATCH 0394/3220] sched: Add Kconfig option DEFAULT_USE_ENERGY_AWARE to set ENERGY_AWARE feature flag The ENERGY_AWARE sched feature flag cannot be set unless CONFIG_SCHED_DEBUG is enabled. So this patch allows the flag to default to true at build time if the config is set. Change-Id: I8835a571fdb7a8f8ee6a54af1e11a69f3b5ce8e6 Signed-off-by: John Stultz --- init/Kconfig | 10 ++++++++++ kernel/sched/features.h | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index e71e35cf723c..acb6645ffda5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1290,6 +1290,16 @@ config SCHED_TUNE If unsure, say N.
+config DEFAULT_USE_ENERGY_AWARE + bool "Default to enabling the Energy Aware Scheduler feature" + default n + help + This option defaults the ENERGY_AWARE scheduling feature to true, + as without SCHED_DEBUG set this feature can't be enabled or disabled + via sysctl. + + Say N if unsure. + config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/sched/features.h b/kernel/sched/features.h index b634151ce286..55e461055332 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,4 +73,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) * Energy aware scheduling. Use platform energy model to guide scheduling * decisions optimizing for energy efficiency. */ +#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE +SCHED_FEAT(ENERGY_AWARE, true) +#else SCHED_FEAT(ENERGY_AWARE, false) +#endif From 55b6a243332cde20882ba331a429d4ce5ec93ae7 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 26 Jan 2016 11:10:38 +0000 Subject: [PATCH 0395/3220] BACKPORT: arm64: kernel: implement ACPI parking protocol The SBBR and ACPI specifications allow ACPI based systems that do not implement PSCI (eg systems with no EL3) to boot through the ACPI parking protocol specification[1]. This patch implements the ACPI parking protocol CPU operations, and adds code that eases parsing the parking protocol data structures to the ARM64 SMP initialization carried out at the same time as cpus enumeration. To wake up the CPUs from the parked state, this patch implements a wakeup IPI for ARM64 (ie arch_send_wakeup_ipi_mask()) that mirrors the ARM one, so that a specific IPI is sent for wake-up purposes in order to distinguish it from other IPI sources. Given the current ACPI MADT parsing API, the patch implements a glue layer that helps pass the MADT GICC data structure from SMP initialization code to the parking protocol implementation, somewhat overriding the CPU operations interfaces. This is to avoid creating a completely transparent DT/ACPI CPU operations layer that would require creating opaque structure handling for CPUs data (DT represents CPUs through DT nodes, ACPI through static MADT table entries), which seems overkill given that ACPI on ARM64 mandates only two booting protocols (PSCI and parking protocol), so there is no need for further protocol additions.
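Condensed sketch of the resulting boot flow (interfaces as declared in this patch; the parking-protocol cpu_ops added in acpi_parking_protocol.c tie them together, and cpu/processor are assumed from the caller's context):

	/* At MADT parse time: record the GICC parking mailbox for the cpu. */
	acpi_set_mailbox_entry(cpu, processor);

	/* At SMP boot time: write the entry point and kick the parked cpu. */
	arch_send_wakeup_ipi_mask(cpumask_of(cpu));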
Based on the original work by Mark Salter [1] https://acpica.org/sites/acpica/files/MP%20Startup%20for%20ARM%20platforms.docx Signed-off-by: Lorenzo Pieralisi Tested-by: Loc Ho Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Mark Rutland Cc: Mark Salter Cc: Al Stone [catalin.marinas@arm.com: Added WARN_ONCE(!acpi_parking_protocol_valid() on the IPI] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 5e89c55e4ed81d7abb1ce8828db35fa389dc0e90) Signed-off-by: Jeff Vander Stoep Change-Id: Ie9a872bff124ca6b3551c812df768cc378658bcc --- arch/arm64/Kconfig | 9 ++ arch/arm64/include/asm/acpi.h | 19 ++- arch/arm64/include/asm/hardirq.h | 2 +- arch/arm64/include/asm/smp.h | 9 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/acpi_parking_protocol.c | 153 ++++++++++++++++++++++ arch/arm64/kernel/cpu_ops.c | 27 +++- arch/arm64/kernel/smp.c | 28 ++++ 8 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 arch/arm64/kernel/acpi_parking_protocol.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ac3dbd933064..1405a1bcc26d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -740,6 +740,15 @@ endmenu menu "Boot options" +config ARM64_ACPI_PARKING_PROTOCOL + bool "Enable support for the ARM64 ACPI parking protocol" + depends on ACPI + help + Enable support for the ARM64 ACPI parking protocol. If disabled + the kernel will not allow booting through the ARM64 ACPI parking + protocol even if the corresponding data is present in the ACPI + MADT table. + config CMDLINE string "Default kernel command string" default "" diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index caafd63b8092..aee323b13802 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -87,9 +87,26 @@ void __init acpi_init_cpus(void); static inline void acpi_init_cpus(void) { } #endif /* CONFIG_ACPI */ +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +bool acpi_parking_protocol_valid(int cpu); +void __init +acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor); +#else +static inline bool acpi_parking_protocol_valid(int cpu) { return false; } +static inline void +acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor) +{} +#endif + static inline const char *acpi_get_enable_method(int cpu) { - return acpi_psci_present() ? 
"psci" : NULL; + if (acpi_psci_present()) + return "psci"; + + if (acpi_parking_protocol_valid(cpu)) + return "parking-protocol"; + + return NULL; } #ifdef CONFIG_ACPI_APEI diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index a57601f9d17c..8740297dac77 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -20,7 +20,7 @@ #include #include -#define NR_IPI 5 +#define NR_IPI 6 typedef struct { unsigned int __softirq_pending; diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index d9c3d6a6100a..2013a4dc5124 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -64,6 +64,15 @@ extern void secondary_entry(void); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); +#else +static inline void arch_send_wakeup_ipi_mask(const struct cpumask *mask) +{ + BUILD_BUG(); +} +#endif + extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 474691f8b13a..c4e2f70c0aa0 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -41,6 +41,7 @@ arm64-obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o +arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c new file mode 100644 index 000000000000..4b1e5a7a98da --- /dev/null +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -0,0 +1,153 @@ +/* + * ARM64 ACPI Parking Protocol implementation + * + * Authors: Lorenzo Pieralisi + * Mark Salter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#include +#include + +#include + +struct cpu_mailbox_entry { + phys_addr_t mailbox_addr; + u8 version; + u8 gic_cpu_id; +}; + +static struct cpu_mailbox_entry cpu_mailbox_entries[NR_CPUS]; + +void __init acpi_set_mailbox_entry(int cpu, + struct acpi_madt_generic_interrupt *p) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + + cpu_entry->mailbox_addr = p->parked_address; + cpu_entry->version = p->parking_version; + cpu_entry->gic_cpu_id = p->cpu_interface_number; +} + +bool acpi_parking_protocol_valid(int cpu) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + + return cpu_entry->mailbox_addr && cpu_entry->version; +} + +static int acpi_parking_protocol_cpu_init(unsigned int cpu) +{ + pr_debug("%s: ACPI parked addr=%llx\n", __func__, + cpu_mailbox_entries[cpu].mailbox_addr); + + return 0; +} + +static int acpi_parking_protocol_cpu_prepare(unsigned int cpu) +{ + return 0; +} + +struct parking_protocol_mailbox { + __le32 cpu_id; + __le32 reserved; + __le64 entry_point; +}; + +static int acpi_parking_protocol_cpu_boot(unsigned int cpu) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + struct parking_protocol_mailbox __iomem *mailbox; + __le32 cpu_id; + + /* + * Map mailbox memory with attribute device nGnRE (ie ioremap - + * this deviates from the parking protocol specifications since + * the mailboxes are required to be mapped nGnRnE; the attribute + * discrepancy is harmless insofar as the protocol specification + * is concerned). + * If the mailbox is mistakenly allocated in the linear mapping + * by FW ioremap will fail since the mapping will be prevented + * by the kernel (it clashes with the linear mapping attributes + * specifications). + */ + mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox)); + if (!mailbox) + return -EIO; + + cpu_id = readl_relaxed(&mailbox->cpu_id); + /* + * Check if firmware has set-up the mailbox entry properly + * before kickstarting the respective cpu. + */ + if (cpu_id != ~0U) { + iounmap(mailbox); + return -ENXIO; + } + + /* + * We write the entry point and cpu id as LE regardless of the + * native endianness of the kernel. Therefore, any boot-loaders + * that read this address need to convert this address to the + * Boot-Loader's endianness before jumping. + */ + writeq_relaxed(__pa(secondary_entry), &mailbox->entry_point); + writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); + + arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + + iounmap(mailbox); + + return 0; +} + +static void acpi_parking_protocol_cpu_postboot(void) +{ + int cpu = smp_processor_id(); + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + struct parking_protocol_mailbox __iomem *mailbox; + __le64 entry_point; + + /* + * Map mailbox memory with attribute device nGnRE (ie ioremap - + * this deviates from the parking protocol specifications since + * the mailboxes are required to be mapped nGnRnE; the attribute + * discrepancy is harmless insofar as the protocol specification + * is concerned). + * If the mailbox is mistakenly allocated in the linear mapping + * by FW ioremap will fail since the mapping will be prevented + * by the kernel (it clashes with the linear mapping attributes + * specifications). + */ + mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox)); + if (!mailbox) + return; + + entry_point = readl_relaxed(&mailbox->entry_point); + /* + * Check if firmware has cleared the entry_point as expected + * by the protocol specification. 
+ */ + WARN_ON(entry_point); + + iounmap(mailbox); +} + +const struct cpu_operations acpi_parking_protocol_ops = { + .name = "parking-protocol", + .cpu_init = acpi_parking_protocol_cpu_init, + .cpu_prepare = acpi_parking_protocol_cpu_prepare, + .cpu_boot = acpi_parking_protocol_cpu_boot, + .cpu_postboot = acpi_parking_protocol_cpu_postboot +}; diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index b6bd7d447768..c7cfb8fe06f9 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -25,19 +25,30 @@ #include extern const struct cpu_operations smp_spin_table_ops; +extern const struct cpu_operations acpi_parking_protocol_ops; extern const struct cpu_operations cpu_psci_ops; const struct cpu_operations *cpu_ops[NR_CPUS]; -static const struct cpu_operations *supported_cpu_ops[] __initconst = { +static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = { &smp_spin_table_ops, &cpu_psci_ops, NULL, }; +static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = { +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL + &acpi_parking_protocol_ops, +#endif + &cpu_psci_ops, + NULL, +}; + static const struct cpu_operations * __init cpu_get_ops(const char *name) { - const struct cpu_operations **ops = supported_cpu_ops; + const struct cpu_operations **ops; + + ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops; while (*ops) { if (!strcmp(name, (*ops)->name)) @@ -75,8 +86,16 @@ static const char *__init cpu_read_enable_method(int cpu) } } else { enable_method = acpi_get_enable_method(cpu); - if (!enable_method) - pr_err("Unsupported ACPI enable-method\n"); + if (!enable_method) { + /* + * In ACPI systems the boot CPU does not require + * checking the enable method since for some + * boot protocol (ie parking protocol) it need not + * be initialized. Don't warn spuriously. + */ + if (cpu != 0) + pr_err("Unsupported ACPI enable-method\n"); + } } return enable_method; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 68e7f79630d4..24cb4f800033 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -70,6 +70,7 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_TIMER, IPI_IRQ_WORK, + IPI_WAKEUP }; /* @@ -443,6 +444,17 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) /* map the logical cpu id to cpu MPIDR */ cpu_logical_map(cpu_count) = hwid; + /* + * Set-up the ACPI parking protocol cpu entries + * while initializing the cpu_logical_map to + * avoid parsing MADT entries multiple times for + * nothing (ie a valid cpu_logical_map entry should + * contain a valid parking protocol data set to + * initialize the cpu if the parking protocol is + * the only available enable method). 
+ */ + acpi_set_mailbox_entry(cpu_count, processor); + cpu_count++; } @@ -625,6 +637,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = { S(IPI_CPU_STOP, "CPU stop interrupts"), S(IPI_TIMER, "Timer broadcast interrupts"), S(IPI_IRQ_WORK, "IRQ work interrupts"), + S(IPI_WAKEUP, "CPU wake-up interrupts"), }; static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) @@ -668,6 +681,13 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC); } +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +void arch_send_wakeup_ipi_mask(const struct cpumask *mask) +{ + smp_cross_call(mask, IPI_WAKEUP); +} +#endif + #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { @@ -745,6 +765,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; #endif +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL + case IPI_WAKEUP: + WARN_ONCE(!acpi_parking_protocol_valid(cpu), + "CPU%u: Wake-up IPI outside the ACPI parking protocol\n", + cpu); + break; +#endif + default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); break; From 0b0254ded773d1f241b1ae7c7fdb137a72f60911 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 27 Jan 2016 10:50:19 +0100 Subject: [PATCH 0396/3220] UPSTREAM: arm64: allow vmalloc regions to be set with set_memory_* The range of set_memory_* is currently restricted to the module address range because of difficulties in breaking down larger block sizes. vmalloc maps PAGE_SIZE pages so it is safe to use as well. Update the function ranges and add a comment explaining why the range is restricted the way it is. Suggested-by: Laura Abbott Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 95f5c80050ad723163aa80dc8bffd48ef4afc6d5) Signed-off-by: Jeff Vander Stoep Change-Id: I80117bfd93ffe90a8edc89cbf6f456d9423bcf73 --- arch/arm64/mm/pageattr.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 3571c7309c5e..29ee7d85ca4c 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,7 @@ static int change_memory_common(unsigned long addr, int numpages, unsigned long end = start + size; int ret; struct page_change_data data; + struct vm_struct *area; if (!PAGE_ALIGNED(addr)) { start &= PAGE_MASK; @@ -51,10 +53,23 @@ static int change_memory_common(unsigned long addr, int numpages, WARN_ON_ONCE(1); } - if (start < MODULES_VADDR || start >= MODULES_END) - return -EINVAL; - - if (end < MODULES_VADDR || end >= MODULES_END) + /* + * Kernel VA mappings are always live, and splitting live section + * mappings into page mappings may cause TLB conflicts. This means + * we have to ensure that changing the permission bits of the range + * we are operating on does not result in such splitting. + * + * Let's restrict ourselves to mappings created by vmalloc (or vmap). + * Those are guaranteed to consist entirely of page mappings, and + * splitting is never needed. + * + * So check whether the [addr, addr + size) interval is entirely + * covered by precisely one VM area that has the VM_ALLOC flag set. 
+ */ + area = find_vm_area((void *)addr); + if (!area || + end > (unsigned long)area->addr + area->size || + !(area->flags & VM_ALLOC)) return -EINVAL; data.set_mask = set_mask; From 58ffa07081e1aabd092257eb6f2fb718dcd3d486 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:23 +0000 Subject: [PATCH 0397/3220] UPSTREAM: arm64: prefetch: don't provide spin_lock_prefetch with LSE The LSE atomics rely on us not dirtying data at L1 if we can avoid it, otherwise many of the potential scalability benefits are lost. This patch replaces spin_lock_prefetch with a nop when the LSE atomics are in use, so that users don't shoot themselves in the foot by causing needless coherence traffic at L1. Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit cd5e10bdf3795d22f10787bb1991c43798c885d5) Signed-off-by: Jeff Vander Stoep Change-Id: Ib8bc3f38d9306c13e017139ae4f2a7c8d6b61e18 --- arch/arm64/include/asm/processor.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 4acb7ca94fcd..31b76fce4477 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -29,6 +29,7 @@ #include +#include #include #include #include @@ -177,9 +178,11 @@ static inline void prefetchw(const void *ptr) } #define ARCH_HAS_SPINLOCK_PREFETCH -static inline void spin_lock_prefetch(const void *x) +static inline void spin_lock_prefetch(const void *ptr) { - prefetchw(x); + asm volatile(ARM64_LSE_ATOMIC_INSN( + "prfm pstl1strm, %a0", + "nop") : : "p" (ptr)); } #define HAVE_ARCH_PICK_MMAP_LAYOUT From b5924f376a221ffa0ee64a4410aff54416c2fd2d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:24 +0000 Subject: [PATCH 0398/3220] UPSTREAM: arm64: prefetch: add alternative pattern for CPUs without a prefetcher Most CPUs have a hardware prefetcher which generally performs better without explicit prefetch instructions issued by software, however some CPUs (e.g. Cavium ThunderX) rely solely on explicit prefetch instructions. This patch adds an alternative pattern (ARM64_HAS_NO_HW_PREFETCH) to allow our library code to make use of explicit prefetch instructions during things like copy routines only when the CPU does not have the capability to perform the prefetching itself. 
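The MIDR helpers this patch factors out are easy to sanity-check in isolation. Below is a self-contained userspace rendition of MIDR_IS_CPU_MODEL_RANGE() and the ThunderX pass 1.x/2.x test added in the next hunks; the MIDR field layout matches the kernel macros shown below, while the sample values in main() are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* MIDR_EL1 layout: [31:24] implementer, [23:20] variant,
 * [19:16] architecture, [15:4] part number, [3:0] revision. */
#define MIDR_REV_MASK   0x0000000fu
#define MIDR_VAR_SHIFT  20
#define MIDR_VAR_MASK   0x00f00000u
#define MIDR_MODEL_MASK 0xff0ffff0u   /* implementer | arch | partnum */

#define MIDR_MODEL(imp, part) \
	(((uint32_t)(imp) << 24) | (0xfu << 16) | ((uint32_t)(part) << 4))

#define MIDR_THUNDERX MIDR_MODEL(0x43, 0x0a1)   /* Cavium ThunderX */

static bool midr_in_model_range(uint32_t midr, uint32_t model,
				uint32_t rv_min, uint32_t rv_max)
{
	uint32_t rv = midr & (MIDR_REV_MASK | MIDR_VAR_MASK);

	return (midr & MIDR_MODEL_MASK) == model &&
	       rv >= rv_min && rv <= rv_max;
}

static bool has_no_hw_prefetch(uint32_t midr)
{
	/* variant 0..1, any revision: ThunderX pass 1.x and 2.x */
	return midr_in_model_range(midr, MIDR_THUNDERX, 0,
				   (1u << MIDR_VAR_SHIFT) | MIDR_REV_MASK);
}

int main(void)
{
	printf("pass 1.0: %d\n", has_no_hw_prefetch(0x430f0a10)); /* 1 */
	printf("pass 2.1: %d\n", has_no_hw_prefetch(0x431f0a11)); /* 1 */
	printf("pass 3.0: %d\n", has_no_hw_prefetch(0x432f0a10)); /* 0 */
	return 0;
}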
Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit d5370f754875460662abe8561388e019d90dd0c4) Signed-off-by: Jeff Vander Stoep Change-Id: Ie33097c9b7786922ff8c457d16515c3188b8e94b --- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/cputype.h | 17 ++++++++++++++++- arch/arm64/kernel/cpu_errata.c | 18 +++--------------- arch/arm64/kernel/cpufeature.c | 17 +++++++++++++++++ 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8f271b83f910..8d56bd8550dc 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -30,8 +30,9 @@ #define ARM64_HAS_LSE_ATOMICS 5 #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 +#define ARM64_HAS_NO_HW_PREFETCH 8 -#define ARM64_NCAPS 8 +#define ARM64_NCAPS 9 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 1a5949364ed0..7540284a17fe 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -57,11 +57,22 @@ #define MIDR_IMPLEMENTOR(midr) \ (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT) -#define MIDR_CPU_PART(imp, partnum) \ +#define MIDR_CPU_MODEL(imp, partnum) \ (((imp) << MIDR_IMPLEMENTOR_SHIFT) | \ (0xf << MIDR_ARCHITECTURE_SHIFT) | \ ((partnum) << MIDR_PARTNUM_SHIFT)) +#define MIDR_CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +#define MIDR_IS_CPU_MODEL_RANGE(midr, model, rv_min, rv_max) \ +({ \ + u32 _model = (midr) & MIDR_CPU_MODEL_MASK; \ + u32 rv = (midr) & (MIDR_REVISION_MASK | MIDR_VARIANT_MASK); \ + \ + _model == (model) && rv >= (rv_min) && rv <= (rv_max); \ + }) + #define ARM_CPU_IMP_ARM 0x41 #define ARM_CPU_IMP_APM 0x50 #define ARM_CPU_IMP_CAVIUM 0x43 @@ -75,6 +86,10 @@ #define CAVIUM_CPU_PART_THUNDERX 0x0A1 +#define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) +#define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) + #ifndef __ASSEMBLY__ /* diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index feb6b4efa641..e6bc988e8dbf 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -21,24 +21,12 @@ #include #include -#define MIDR_CORTEX_A53 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) -#define MIDR_CORTEX_A57 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) -#define MIDR_THUNDERX MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) - -#define CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ - MIDR_ARCHITECTURE_MASK) - static bool __maybe_unused is_affected_midr_range(const struct arm64_cpu_capabilities *entry) { - u32 midr = read_cpuid_id(); - - if ((midr & CPU_MODEL_MASK) != entry->midr_model) - return false; - - midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - - return (midr >= entry->midr_range_min && midr <= entry->midr_range_max); + return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model, + entry->midr_range_min, + entry->midr_range_max); } #define MIDR_RANGE(model, min, max) \ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5c90aa490a2b..3615d7d7c9af 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -621,6 +621,18 @@ static bool 
has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry) return has_sre; } +static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry) +{ + u32 midr = read_cpuid_id(); + u32 rv_min, rv_max; + + /* Cavium ThunderX pass 1.x and 2.x */ + rv_min = 0; + rv_max = (1 << MIDR_VARIANT_SHIFT) | MIDR_REVISION_MASK; + + return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, rv_min, rv_max); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -651,6 +663,11 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 2, }, #endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */ + { + .desc = "Software prefetching using PRFM", + .capability = ARM64_HAS_NO_HW_PREFETCH, + .matches = has_no_hw_prefetch, + }, {}, }; From 297bf83fa4e7804c28c6d950f39dbd3860552cb9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:25 +0000 Subject: [PATCH 0399/3220] UPSTREAM: arm64: lib: improve copy_page to deal with 128 bytes at a time We want to avoid lots of different copy_page implementations, settling for something that is "good enough" everywhere and hopefully easy to understand and maintain whilst we're at it. This patch reworks our copy_page implementation based on discussions with Cavium on the list and benchmarking on Cortex-A processors so that: - The loop is unrolled to copy 128 bytes per iteration - The reads are offset so that we read from the next 128-byte block in the same iteration that we store the previous block - Explicit prefetch instructions are removed for now, since they hurt performance on CPUs with hardware prefetching - The loop exit condition is calculated at the start of the loop Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 223e23e8aa26b0bb62c597637e77295e14f6a62c) Signed-off-by: Jeff Vander Stoep Change-Id: Icabd86bbecc60ad0d730ab796e33b8762cecb1fb --- arch/arm64/lib/copy_page.S | 46 +++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 512b9a7b980e..2534533ceb1d 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -27,20 +27,50 @@ * x1 - src */ ENTRY(copy_page) - /* Assume cache line size is 64 bytes. 
*/ - prfm pldl1strm, [x1, #64] -1: ldp x2, x3, [x1] + ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] ldp x8, x9, [x1, #48] - add x1, x1, #64 - prfm pldl1strm, [x1, #64] + ldp x10, x11, [x1, #64] + ldp x12, x13, [x1, #80] + ldp x14, x15, [x1, #96] + ldp x16, x17, [x1, #112] + + mov x18, #(PAGE_SIZE - 128) + add x1, x1, #128 +1: + subs x18, x18, #128 + + stnp x2, x3, [x0] + ldp x2, x3, [x1] + stnp x4, x5, [x0, #16] + ldp x4, x5, [x1, #16] + stnp x6, x7, [x0, #32] + ldp x6, x7, [x1, #32] + stnp x8, x9, [x0, #48] + ldp x8, x9, [x1, #48] + stnp x10, x11, [x0, #64] + ldp x10, x11, [x1, #64] + stnp x12, x13, [x0, #80] + ldp x12, x13, [x1, #80] + stnp x14, x15, [x0, #96] + ldp x14, x15, [x1, #96] + stnp x16, x17, [x0, #112] + ldp x16, x17, [x1, #112] + + add x0, x0, #128 + add x1, x1, #128 + + b.gt 1b + stnp x2, x3, [x0] stnp x4, x5, [x0, #16] stnp x6, x7, [x0, #32] stnp x8, x9, [x0, #48] - add x0, x0, #64 - tst x1, #(PAGE_SIZE - 1) - b.ne 1b + stnp x10, x11, [x0, #64] + stnp x12, x13, [x0, #80] + stnp x14, x15, [x0, #96] + stnp x16, x17, [x0, #112] + ret ENDPROC(copy_page) From 6e1f8e3c0fb86a90fe98667cfa1be71957534c66 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Tue, 2 Feb 2016 12:46:26 +0000 Subject: [PATCH 0400/3220] UPSTREAM: arm64: lib: patch in prfm for copy_page if requested On ThunderX T88 pass 1 and pass 2, there is no hardware prefetching so we need to patch in explicit software prefetching instructions Prefetching improves this code by 60% over the original code and 2x over the code without prefetching for the affected hardware using the benchmark code at https://github.com/apinski-cavium/copy_page_benchmark Signed-off-by: Andrew Pinski Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 60e0a09db24adc8809696307e5d97cc4ba7cb3e0) Signed-off-by: Jeff Vander Stoep Change-Id: I3821a4d3a7b6fd68b4b0aca31478ec960e4e5172 --- arch/arm64/lib/copy_page.S | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 2534533ceb1d..4c1e700840b6 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -18,6 +18,8 @@ #include #include #include +#include +#include /* * Copy a page from src to dest (both are page aligned) @@ -27,6 +29,15 @@ * x1 - src */ ENTRY(copy_page) +alternative_if_not ARM64_HAS_NO_HW_PREFETCH + nop + nop +alternative_else + # Prefetch two cache lines ahead. + prfm pldl1strm, [x1, #128] + prfm pldl1strm, [x1, #256] +alternative_endif + ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] @@ -41,6 +52,12 @@ ENTRY(copy_page) 1: subs x18, x18, #128 +alternative_if_not ARM64_HAS_NO_HW_PREFETCH + nop +alternative_else + prfm pldl1strm, [x1, #384] +alternative_endif + stnp x2, x3, [x0] ldp x2, x3, [x1] stnp x4, x5, [x0, #16] From f54fba19ed7891dc214085f55d73990f19e9c756 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 10 Feb 2016 10:07:30 +0000 Subject: [PATCH 0401/3220] UPSTREAM: arm64: prefetch: add missing #include for spin_lock_prefetch As of 52e662326e1e ("arm64: prefetch: don't provide spin_lock_prefetch with LSE"), spin_lock_prefetch is patched at runtime when the LSE atomics are in use. This relies on the ARM64_LSE_ATOMIC_INSN macro to drive the alternatives framework, but that macro is only available via asm/lse.h, which isn't explicitly included in processor.h. 
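Returning briefly to the copy_page() rework a couple of patches back, before the build-failure details that follow: the assembly is a classic software pipeline, priming 128 bytes worth of registers and then storing the previous block while loading the next. A plain-C analogue of the loop structure may make it clearer (illustrative only; the real routine runs on page-aligned buffers with ldp/stnp register pairs):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define BLK 128   /* bytes copied per iteration, as in the unrolled loop */

static void copy_page_sketch(unsigned char *dst, const unsigned char *src)
{
	unsigned char blk[BLK];          /* stands in for registers x2..x17 */
	long left = PAGE_SIZE - BLK;

	memcpy(blk, src, BLK);           /* prime the pipeline */
	src += BLK;
	do {
		left -= BLK;
		memcpy(dst, blk, BLK);   /* store the previous block... */
		memcpy(blk, src, BLK);   /* ...while fetching the next */
		dst += BLK;
		src += BLK;
	} while (left > 0);
	memcpy(dst, blk, BLK);           /* drain the final block */
}

int main(void)
{
	static unsigned char s[PAGE_SIZE], d[PAGE_SIZE];

	for (int i = 0; i < PAGE_SIZE; i++)
		s[i] = (unsigned char)i;
	copy_page_sketch(d, s);
	printf("match: %d\n", memcmp(s, d, PAGE_SIZE) == 0);
	return 0;
}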
Consequently, drivers can run into build failures such as:

 In file included from include/linux/prefetch.h:14:0,
 from drivers/net/ethernet/intel/i40e/i40e_txrx.c:27:
 arch/arm64/include/asm/processor.h: In function 'spin_lock_prefetch':
 arch/arm64/include/asm/processor.h:183:15: error: expected string
 literal before 'ARM64_LSE_ATOMIC_INSN'
 asm volatile(ARM64_LSE_ATOMIC_INSN(

This patch adds the missing include and gets things building again.

Reported-by: kbuild test robot
Signed-off-by: Will Deacon
Signed-off-by: Catalin Marinas
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit afb83cc3f0e4f86ea0e1cc3db7a90f58f1abd4d5)
Signed-off-by: Jeff Vander Stoep
Change-Id: I9a9e6b920f96b060781a01bff50a0ca8db78cf7f
---
 arch/arm64/include/asm/processor.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 31b76fce4477..5bb1d763d17a 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include

From a8a3d94dbb8ef81a8710b4fec7dfd1a07f2647f9 Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 5 Feb 2016 16:24:46 -0800
Subject: [PATCH 0402/3220] UPSTREAM: arm64: Drop alloc function from create_mapping

create_mapping is only used in fixmap_remap_fdt. All the create_mapping
calls need to happen on existing translation table pages without
additional allocations. Rather than have an alloc function be called
and fail, just set it to NULL and catch its use. Also change the name
to create_mapping_noalloc to better capture what exactly is going on.

Reviewed-by: Ard Biesheuvel
Reviewed-by: Mark Rutland
Signed-off-by: Laura Abbott
Signed-off-by: Catalin Marinas
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit 132233a759580f5ce9b1bfaac9073e47d03c460d)
Signed-off-by: Jeff Vander Stoep
Change-Id: I48db9adae540e79d47328b4045f26f22ecebc482
---
 arch/arm64/mm/mmu.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 6f5770a1b140..b4788d3e1551 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -116,7 +116,9 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
 pte_t *pte;

 if (pmd_none(*pmd) || pmd_sect(*pmd)) {
- phys_addr_t pte_phys = pgtable_alloc();
+ phys_addr_t pte_phys;
+ BUG_ON(!pgtable_alloc);
+ pte_phys = pgtable_alloc();
 pte = pte_set_fixmap(pte_phys);
 if (pmd_sect(*pmd))
 split_pmd(pmd, pte);
@@ -158,7 +160,9 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 * Check for initial section mappings in the pgd/pud and remove them.
*/ if (pud_none(*pud) || pud_sect(*pud)) { - phys_addr_t pmd_phys = pgtable_alloc(); + phys_addr_t pmd_phys; + BUG_ON(!pgtable_alloc); + pmd_phys = pgtable_alloc(); pmd = pmd_set_fixmap(pmd_phys); if (pud_sect(*pud)) { /* @@ -223,7 +227,9 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long next; if (pgd_none(*pgd)) { - phys_addr_t pud_phys = pgtable_alloc(); + phys_addr_t pud_phys; + BUG_ON(!pgtable_alloc); + pud_phys = pgtable_alloc(); __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE); } BUG_ON(pgd_bad(*pgd)); @@ -312,7 +318,12 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc); } -static void __init create_mapping(phys_addr_t phys, unsigned long virt, +/* + * This function can only be used to modify existing table entries, + * without allocating new levels of table. Note that this permits the + * creation of new section or page entries. + */ +static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { if (virt < VMALLOC_START) { @@ -321,7 +332,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, return; } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - early_pgtable_alloc); + NULL); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -678,7 +689,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) /* * Make sure that the FDT region can be mapped without the need to * allocate additional translation table pages, so that it is safe - * to call create_mapping() this early. + * to call create_mapping_noalloc() this early. * * On 64k pages, the FDT will be mapped using PTEs, so we need to * be in the same PMD as the rest of the fixmap. @@ -694,8 +705,8 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) dt_virt = (void *)dt_virt_base + offset; /* map the first chunk so we can read the size from the header */ - create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, - SWAPPER_BLOCK_SIZE, prot); + create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), + dt_virt_base, SWAPPER_BLOCK_SIZE, prot); if (fdt_check_header(dt_virt) != 0) return NULL; @@ -705,7 +716,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return NULL; if (offset + size > SWAPPER_BLOCK_SIZE) - create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, + create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, round_up(offset + size, SWAPPER_BLOCK_SIZE), prot); memblock_reserve(dt_phys, size); From 644210f3f890d1e5481378d28d150d1c5f4191fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Tue, 26 Jan 2016 15:47:25 +0000 Subject: [PATCH 0403/3220] UPSTREAM: arm64: mm: avoid calling apply_to_page_range on empty range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling apply_to_page_range with an empty range results in a BUG_ON from the core code. This can be triggered by trying to load the st_drv module with CONFIG_DEBUG_SET_MODULE_RONX enabled: kernel BUG at mm/memory.c:1874! 
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
Modules linked in:
CPU: 3 PID: 1764 Comm: insmod Not tainted 4.5.0-rc1+ #2
Hardware name: ARM Juno development board (r0) (DT)
task: ffffffc9763b8000 ti: ffffffc975af8000 task.ti: ffffffc975af8000
PC is at apply_to_page_range+0x2cc/0x2d0
LR is at change_memory_common+0x80/0x108

This patch fixes the issue by making change_memory_common (called by
the set_memory_* functions) a NOP when numpages == 0, therefore avoiding
the erroneous call to apply_to_page_range and bringing us into line with
x86 and s390.

Cc:
Reviewed-by: Laura Abbott
Acked-by: David Rientjes
Signed-off-by: Mika Penttilä
Signed-off-by: Will Deacon
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit 57adec866c0440976c96a4b8f5b59fb411b1cacb)
Signed-off-by: Jeff Vander Stoep
Change-Id: Ia107d3b324cc8237f669778a7c9c3abae8637501
---
 arch/arm64/mm/pageattr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 29ee7d85ca4c..0795c3a36d8f 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -72,6 +72,9 @@ static int change_memory_common(unsigned long addr, int numpages,
 !(area->flags & VM_ALLOC))
 return -EINVAL;

+ if (!numpages)
+ return 0;
+
 data.set_mask = set_mask;
 data.clear_mask = clear_mask;

From 664e159255fe7afc89145b4eed47b370d01489fa Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 5 Feb 2016 16:24:47 -0800
Subject: [PATCH 0404/3220] UPSTREAM: arm64: Add support for ARCH_SUPPORTS_DEBUG_PAGEALLOC

ARCH_SUPPORTS_DEBUG_PAGEALLOC provides a hook to map and unmap pages
for debugging purposes. This requires memory be mapped with PAGE_SIZE
mappings since breaking down larger mappings at runtime will lead to
TLB conflicts. Check if debug_pagealloc is enabled at runtime and if
so, map everything with PAGE_SIZE pages. Implement the functions to
actually map/unmap the pages at runtime.

Reviewed-by: Ard Biesheuvel
Reviewed-by: Mark Rutland
Tested-by: Mark Rutland
Signed-off-by: Laura Abbott
[catalin.marinas@arm.com: static annotation block_mappings_allowed() and #ifdef]
Signed-off-by: Catalin Marinas
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit 83863f25e4b8214e994ef8b5647aad614d74b45d)
Signed-off-by: Jeff Vander Stoep
Change-Id: Ia9b9bc5fcc1938bafbc2930f08d27973cfc3d038
---
 arch/arm64/Kconfig | 3 +++
 arch/arm64/mm/mmu.c | 26 +++++++++++++++++++++--
 arch/arm64/mm/pageattr.c | 46 +++++++++++++++++++++++++++++++---------
 3 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1405a1bcc26d..49988da6f342 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -541,6 +541,9 @@ config HOTPLUG_CPU
 source kernel/Kconfig.preempt
 source kernel/Kconfig.hz

+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ def_bool y
+
 config ARCH_HAS_HOLES_MEMORYMODEL
 def_bool y if SPARSEMEM

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b4788d3e1551..8d0e49d2cba9 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -149,6 +149,26 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd)
 } while (pmd++, i++, i < PTRS_PER_PMD);
 }

+#ifdef CONFIG_DEBUG_PAGEALLOC
+static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
+{
+
+ /*
+ * If debug_page_alloc is enabled we must map the linear map
+ * using pages. However, other mappings created by
+ * create_mapping_noalloc must use sections in some cases. Allow
+ * sections to be used in those cases, where no pgtable_alloc
+ * function is provided.
+ */
+ return !pgtable_alloc || !debug_pagealloc_enabled();
+}
+#else
+static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
+{
+ return true;
+}
+#endif
+
 static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 phys_addr_t phys, pgprot_t prot,
 phys_addr_t (*pgtable_alloc)(void))
@@ -181,7 +201,8 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 do {
 next = pmd_addr_end(addr, end);
 /* try section mapping first */
- if (((addr | next | phys) & ~SECTION_MASK) == 0) {
+ if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
+ block_mappings_allowed(pgtable_alloc)) {
 pmd_t old_pmd =*pmd;
 set_pmd(pmd, __pmd(phys |
 pgprot_val(mk_sect_prot(prot))));
@@ -241,7 +262,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 /*
 * For 4K granule only, attempt to put down a 1GB block
 */
- if (use_1G_block(addr, next, phys)) {
+ if (use_1G_block(addr, next, phys) &&
+ block_mappings_allowed(pgtable_alloc)) {
 pud_t old_pud = *pud;
 set_pud(pud, __pud(phys |
 pgprot_val(mk_sect_prot(prot))));
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 0795c3a36d8f..ca6d268e3313 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -37,14 +37,31 @@ static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
 return 0;
 }

+/*
+ * This function assumes that the range is mapped with PAGE_SIZE pages.
+ */
+static int __change_memory_common(unsigned long start, unsigned long size,
+ pgprot_t set_mask, pgprot_t clear_mask)
+{
+ struct page_change_data data;
+ int ret;
+
+ data.set_mask = set_mask;
+ data.clear_mask = clear_mask;
+
+ ret = apply_to_page_range(&init_mm, start, size, change_page_range,
+ &data);
+
+ flush_tlb_kernel_range(start, start + size);
+ return ret;
+}
+
 static int change_memory_common(unsigned long addr, int numpages,
 pgprot_t set_mask, pgprot_t clear_mask)
 {
 unsigned long start = addr;
 unsigned long size = PAGE_SIZE*numpages;
 unsigned long end = start + size;
- int ret;
- struct page_change_data data;
 struct vm_struct *area;

 if (!PAGE_ALIGNED(addr)) {
@@ -75,14 +92,7 @@ static int change_memory_common(unsigned long addr, int numpages,
 if (!numpages)
 return 0;

- data.set_mask = set_mask;
- data.clear_mask = clear_mask;
-
- ret = apply_to_page_range(&init_mm, start, size, change_page_range,
- &data);
-
- flush_tlb_kernel_range(start, end);
- return ret;
+ return __change_memory_common(start, size, set_mask, clear_mask);
 }

 int set_memory_ro(unsigned long addr, int numpages)
@@ -114,3 +124,19 @@ int set_memory_x(unsigned long addr, int numpages)
 __pgprot(PTE_PXN));
 }
 EXPORT_SYMBOL_GPL(set_memory_x);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ unsigned long addr = (unsigned long) page_address(page);
+
+ if (enable)
+ __change_memory_common(addr, PAGE_SIZE * numpages,
+ __pgprot(PTE_VALID),
+ __pgprot(0));
+ else
+ __change_memory_common(addr, PAGE_SIZE * numpages,
+ __pgprot(0),
+ __pgprot(PTE_VALID));
+}
+#endif

From 4d4c2259aa3270aceb9c55562afdf586c8328eee Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 5 Feb 2016 16:24:48 -0800
Subject: [PATCH 0405/3220] UPSTREAM: arm64: ptdump: Indicate whether memory should be faulting

With CONFIG_DEBUG_PAGEALLOC, pages do not have the valid bit set when
free in the buddy allocator. Add an indication to the page table
dumping code that the valid bit is not set, 'F' for fault, to make
this easier to understand.
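Taken together, __kernel_map_pages() from the previous patch and the 'F' marker added here reduce to toggling PTE_VALID across a run of last-level entries. A toy userspace model of that toggle (hypothetical names; the real code also flushes the TLB range):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_PTE_VALID ((uint64_t)1 << 0)  /* stand-in for the real PTE_VALID */

static uint64_t ptes[8];  /* toy last-level page table */

/* Models what __kernel_map_pages() does via __change_memory_common(). */
static void kernel_map_pages_model(unsigned int first, int numpages, bool enable)
{
	for (int i = 0; i < numpages; i++) {
		if (enable)
			ptes[first + i] |= MODEL_PTE_VALID;   /* set_mask */
		else
			ptes[first + i] &= ~MODEL_PTE_VALID;  /* clear_mask */
	}
	/* the kernel version follows this with a TLB range flush */
}

int main(void)
{
	for (int i = 0; i < 8; i++)  /* pretend all pages start mapped */
		ptes[i] = MODEL_PTE_VALID | ((uint64_t)(i + 1) << 12);

	kernel_map_pages_model(2, 3, false);  /* pages freed to the allocator */

	for (int i = 0; i < 8; i++)
		printf("pte[%d]: %s\n", i,
		       (ptes[i] & MODEL_PTE_VALID) ? "valid" : "F (faults)");
	return 0;
}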
Reviewed-by: Ard Biesheuvel Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit d7e9d59494a9a5d83274f5af2148b82ca22dff3f) Signed-off-by: Jeff Vander Stoep Change-Id: I7e8200549e22cb3a01b5e827bf12b872b9cefc58 --- arch/arm64/mm/dump.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 0adbebbc2803..0841b2bf0e6a 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -90,6 +90,11 @@ struct prot_bits { static const struct prot_bits pte_bits[] = { { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, { .mask = PTE_USER, .val = PTE_USER, .set = "USR", From 3aa18069d0038c0475ca598d2a9e9533ab6c28f7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Feb 2016 15:50:18 -0800 Subject: [PATCH 0406/3220] UPSTREAM: arm64: ubsan: select ARCH_HAS_UBSAN_SANITIZE_ALL To enable UBSAN on arm64, ARCH_HAS_UBSAN_SANITIZE_ALL need to be selected. Basic kernel bootup test is passed on arm64 with CONFIG_UBSAN_SANITIZE_ALL enabled. Signed-off-by: Yang Shi Acked-by: Andrey Ryabinin Tested-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f0b7f8a4b44657386273a67179dd901c81cd11a6) Signed-off-by: Jeff Vander Stoep Change-Id: I640f50e70e3562bf1caf2ce164d6ed5640e041f6 --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 49988da6f342..71ce487d7b78 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -13,6 +13,7 @@ config ARM64 select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARM_AMBA select ARM_ARCH_TIMER select ARM_GIC From a275e00e2833f58216ed58baae6de790a4dc27cb Mon Sep 17 00:00:00 2001 From: David Brown Date: Wed, 10 Feb 2016 13:52:22 -0800 Subject: [PATCH 0407/3220] UPSTREAM: arm64: vdso: Mark vDSO code as read-only Although the arm64 vDSO is cleanly separated by code/data with the code being read-only in userspace mappings, the code page is still writable from the kernel. There have been exploits (such as http://itszn.com/blog/?p=21) that take advantage of this on x86 to go from a bad kernel write to full root. Prevent this specific exploit on arm64 by putting the vDSO code page in read-only memory as well. 
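The descriptor-table style used by the ptdump patch two entries back is what produces the permission strings in the before/after listings below. In miniature (bit positions here are illustrative placeholders, not the real arm64 PTE layout):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct prot_bits {
	uint64_t mask;
	uint64_t val;
	const char *set;    /* printed when (pte & mask) == val */
	const char *clear;  /* printed otherwise */
};

/* Illustrative bit positions; the real ones come from pgtable-hwdef.h. */
#define D_PTE_VALID  (1ull << 0)
#define D_PTE_USER   (1ull << 6)
#define D_PTE_RDONLY (1ull << 7)

static const struct prot_bits pte_bits[] = {
	{ D_PTE_VALID,  D_PTE_VALID,  " ",   "F"  },
	{ D_PTE_USER,   D_PTE_USER,   "USR", ""   },
	{ D_PTE_RDONLY, D_PTE_RDONLY, "ro",  "RW" },
};

static void dump_prot(uint64_t pte)
{
	for (size_t i = 0; i < sizeof(pte_bits) / sizeof(pte_bits[0]); i++) {
		const struct prot_bits *b = &pte_bits[i];

		printf("%s ", (pte & b->mask) == b->val ? b->set : b->clear);
	}
	putchar('\n');
}

int main(void)
{
	dump_prot(D_PTE_VALID | D_PTE_RDONLY);  /* mapped, kernel, read-only */
	dump_prot(0);                           /* freed page: leads with "F" */
	return 0;
}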
Before the change: [ 3.138366] vdso: 2 pages (1 code @ ffffffc000a71000, 1 data @ ffffffc000a70000) ---[ Kernel Mapping ]--- 0xffffffc000000000-0xffffffc000082000 520K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000082000-0xffffffc000200000 1528K ro x SHD AF UXN MEM/NORMAL 0xffffffc000200000-0xffffffc000800000 6M ro x SHD AF BLK UXN MEM/NORMAL 0xffffffc000800000-0xffffffc0009b6000 1752K ro x SHD AF UXN MEM/NORMAL 0xffffffc0009b6000-0xffffffc000c00000 2344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000c00000-0xffffffc008000000 116M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc00c000000-0xffffffc07f000000 1840M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc800000000-0xffffffc840000000 1G RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc840000000-0xffffffc87ae00000 942M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87ae00000-0xffffffc87ae70000 448K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af80000-0xffffffc87af8a000 40K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af8b000-0xffffffc87b000000 468K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87b000000-0xffffffc87fe00000 78M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87fe00000-0xffffffc87ff50000 1344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87ff90000-0xffffffc87ffa0000 64K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87fff0000-0xffffffc880000000 64K RW NX SHD AF UXN MEM/NORMAL After: [ 3.138368] vdso: 2 pages (1 code @ ffffffc0006de000, 1 data @ ffffffc000a74000) ---[ Kernel Mapping ]--- 0xffffffc000000000-0xffffffc000082000 520K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000082000-0xffffffc000200000 1528K ro x SHD AF UXN MEM/NORMAL 0xffffffc000200000-0xffffffc000800000 6M ro x SHD AF BLK UXN MEM/NORMAL 0xffffffc000800000-0xffffffc0009b8000 1760K ro x SHD AF UXN MEM/NORMAL 0xffffffc0009b8000-0xffffffc000c00000 2336K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000c00000-0xffffffc008000000 116M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc00c000000-0xffffffc07f000000 1840M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc800000000-0xffffffc840000000 1G RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc840000000-0xffffffc87ae00000 942M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87ae00000-0xffffffc87ae70000 448K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af80000-0xffffffc87af8a000 40K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af8b000-0xffffffc87b000000 468K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87b000000-0xffffffc87fe00000 78M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87fe00000-0xffffffc87ff50000 1344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87ff90000-0xffffffc87ffa0000 64K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87fff0000-0xffffffc880000000 64K RW NX SHD AF UXN MEM/NORMAL Inspired by https://lkml.org/lkml/2016/1/19/494 based on work by the PaX Team, Brad Spengler, and Kees Cook. 
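Mechanically, the fix in the diff below just emits the vDSO image into .rodata, so the kernel maps its page without write permission (the "ro x" lines in the listings above). The same idea in a userspace sketch (uncommenting the write would fault):

#include <stdio.h>

/* 'const' objects land in .rodata, which the loader maps read-only;
 * this is the userspace analogue of moving the vdso.so .incbin out of
 * the writable data section and into .rodata. */
static const unsigned char blob[] = { 0xde, 0xad, 0xbe, 0xef };

int main(void)
{
	printf("blob at %p, first byte %#x\n", (const void *)blob, blob[0]);
	/* ((unsigned char *)blob)[0] = 0;  // would SIGSEGV: page is R/O */
	return 0;
}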
Signed-off-by: David Brown Acked-by: Will Deacon Acked-by: Ard Biesheuvel [catalin.marinas@arm.com: removed superfluous __PAGE_ALIGNED_DATA] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 88d8a7994e564d209d4b2583496631c2357d386b) Signed-off-by: Jeff Vander Stoep Change-Id: I3fe4b48df8b27313ac61c947746805442757932c --- arch/arm64/kernel/vdso/vdso.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S index 60c1db54b41a..82379a70ef03 100644 --- a/arch/arm64/kernel/vdso/vdso.S +++ b/arch/arm64/kernel/vdso/vdso.S @@ -21,9 +21,8 @@ #include #include - __PAGE_ALIGNED_DATA - .globl vdso_start, vdso_end + .section .rodata .balign PAGE_SIZE vdso_start: .incbin "arch/arm64/kernel/vdso/vdso.so" From de30ecfe314d3aca506ed45e0892a942ce523bd4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Feb 2016 09:51:49 +0100 Subject: [PATCH 0408/3220] UPSTREAM: arm64: use local label prefixes for __reg_num symbols The __reg_num_xNN symbols that are used to implement the msr_s and mrs_s macros are recorded in the ELF metadata of each object file. This does not affect the size of the final binary, but it does clutter the output of tools like readelf, i.e., $ readelf -a vmlinux |grep -c __reg_num_x 50976 So let's use symbols with the .L prefix, these are strictly local, and don't end up in the object files. $ readelf -a vmlinux |grep -c __reg_num_x 0 Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 7abc7d833c9eb16efc8a59239d3771a6e30be367) Signed-off-by: Jeff Vander Stoep Change-Id: Idefe9841ef7d1ddcc5161fc8de14153cfadaf4f3 --- arch/arm64/include/asm/sysreg.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d48ab5b41f52..76907c94b11f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -194,32 +194,32 @@ #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 - .equ __reg_num_x\num, \num + .equ .L__reg_num_x\num, \num .endr - .equ __reg_num_xzr, 31 + .equ .L__reg_num_xzr, 31 .macro mrs_s, rt, sreg - .inst 0xd5200000|(\sreg)|(__reg_num_\rt) + .inst 0xd5200000|(\sreg)|(.L__reg_num_\rt) .endm .macro msr_s, sreg, rt - .inst 0xd5000000|(\sreg)|(__reg_num_\rt) + .inst 0xd5000000|(\sreg)|(.L__reg_num_\rt) .endm #else asm( " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" -" .equ __reg_num_x\\num, \\num\n" +" .equ .L__reg_num_x\\num, \\num\n" " .endr\n" -" .equ __reg_num_xzr, 31\n" +" .equ .L__reg_num_xzr, 31\n" "\n" " .macro mrs_s, rt, sreg\n" -" .inst 0xd5200000|(\\sreg)|(__reg_num_\\rt)\n" +" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n" " .endm\n" "\n" " .macro msr_s, sreg, rt\n" -" .inst 0xd5000000|(\\sreg)|(__reg_num_\\rt)\n" +" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n" " .endm\n" ); From ac3060aed4ea0173b093c474fbd8981c4e1dc274 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:46 +0000 Subject: [PATCH 0409/3220] UPSTREAM: arm64: cpufeature: Change read_cpuid() to use sysreg's mrs_s macro Older assemblers may not have support for newer feature registers. To get round this, sysreg.h provides a 'mrs_s' macro that takes a register encoding and generates the raw instruction. 
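Concretely, the word that mrs_s emits can be computed by hand. A small self-checking calculator, assuming the sys_reg() operand packing used elsewhere in this series, ((op0 & 3) << 19) | (op1 << 16) | (crn << 12) | (crm << 8) | (op2 << 5):

#include <stdint.h>
#include <stdio.h>

/* Assumed to match the kernel's sys_reg() packing of MRS/MSR operands. */
static uint32_t sys_reg(uint32_t op0, uint32_t op1, uint32_t crn,
			uint32_t crm, uint32_t op2)
{
	return ((op0 & 3) << 19) | (op1 << 16) | (crn << 12) |
	       (crm << 8) | (op2 << 5);
}

/* mrs_s: .inst 0xd5200000 | sreg | Rt  (Rt = 0..30, 31 = xzr) */
static uint32_t mrs_s(uint32_t sreg, uint32_t rt)
{
	return 0xd5200000u | sreg | rt;
}

int main(void)
{
	/* SYS_ID_AA64MMFR2_EL1 is sys_reg(3, 0, 0, 7, 2) per sysreg.h. */
	uint32_t insn = mrs_s(sys_reg(3, 0, 0, 7, 2), 0 /* x0 */);

	/* Expect 0xd5380740, i.e. "mrs x0, s3_0_c0_c7_2". */
	printf("insn = %#x\n", insn);
	return 0;
}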
Change read_cpuid() to use mrs_s in all cases so that new registers don't have to be a special case. Including sysreg.h means we need to move the include and definition of read_cpuid() after the #ifndef __ASSEMBLY__ to avoid syntax errors in vmlinux.lds. Signed-off-by: James Morse Acked-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 0f54b14e76f5302afe164dc911b049b5df836ff5) Signed-off-by: Jeff Vander Stoep Change-Id: Ic59c81a97b4585b3c0964f293ddee08f7cb594ac --- arch/arm64/include/asm/cpufeature.h | 2 +- arch/arm64/include/asm/cputype.h | 20 ++++++----- arch/arm64/kernel/cpufeature.c | 54 ++++++++++++++--------------- arch/arm64/kernel/cpuinfo.c | 50 +++++++++++++------------- arch/arm64/mm/context.c | 2 +- 5 files changed, 65 insertions(+), 63 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8d56bd8550dc..8131abfabb0a 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -177,7 +177,7 @@ u64 read_system_reg(u32 id); static inline bool cpu_supports_mixed_endian_el0(void) { - return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); + return id_aa64mmfr0_mixed_endian_el0(read_cpuid(SYS_ID_AA64MMFR0_EL1)); } static inline bool system_supports_mixed_endian_el0(void) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 7540284a17fe..b3a83da152a7 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -32,12 +32,6 @@ #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK) -#define read_cpuid(reg) ({ \ - u64 __val; \ - asm("mrs %0, " #reg : "=r" (__val)); \ - __val; \ -}) - #define MIDR_REVISION_MASK 0xf #define MIDR_REVISION(midr) ((midr) & MIDR_REVISION_MASK) #define MIDR_PARTNUM_SHIFT 4 @@ -92,6 +86,14 @@ #ifndef __ASSEMBLY__ +#include + +#define read_cpuid(reg) ({ \ + u64 __val; \ + asm("mrs_s %0, " __stringify(reg) : "=r" (__val)); \ + __val; \ +}) + /* * The CPU ID never changes at run time, so we might as well tell the * compiler that it's constant. 
Use this function to read the CPU ID @@ -99,12 +101,12 @@ */ static inline u32 __attribute_const__ read_cpuid_id(void) { - return read_cpuid(MIDR_EL1); + return read_cpuid(SYS_MIDR_EL1); } static inline u64 __attribute_const__ read_cpuid_mpidr(void) { - return read_cpuid(MPIDR_EL1); + return read_cpuid(SYS_MPIDR_EL1); } static inline unsigned int __attribute_const__ read_cpuid_implementor(void) @@ -119,7 +121,7 @@ static inline unsigned int __attribute_const__ read_cpuid_part_number(void) static inline u32 __attribute_const__ read_cpuid_cachetype(void) { - return read_cpuid(CTR_EL0); + return read_cpuid(SYS_CTR_EL0); } #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 3615d7d7c9af..1ef10e784031 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -808,35 +808,35 @@ static inline void set_sys_caps_initialised(void) static u64 __raw_read_system_reg(u32 sys_id) { switch (sys_id) { - case SYS_ID_PFR0_EL1: return (u64)read_cpuid(ID_PFR0_EL1); - case SYS_ID_PFR1_EL1: return (u64)read_cpuid(ID_PFR1_EL1); - case SYS_ID_DFR0_EL1: return (u64)read_cpuid(ID_DFR0_EL1); - case SYS_ID_MMFR0_EL1: return (u64)read_cpuid(ID_MMFR0_EL1); - case SYS_ID_MMFR1_EL1: return (u64)read_cpuid(ID_MMFR1_EL1); - case SYS_ID_MMFR2_EL1: return (u64)read_cpuid(ID_MMFR2_EL1); - case SYS_ID_MMFR3_EL1: return (u64)read_cpuid(ID_MMFR3_EL1); - case SYS_ID_ISAR0_EL1: return (u64)read_cpuid(ID_ISAR0_EL1); - case SYS_ID_ISAR1_EL1: return (u64)read_cpuid(ID_ISAR1_EL1); - case SYS_ID_ISAR2_EL1: return (u64)read_cpuid(ID_ISAR2_EL1); - case SYS_ID_ISAR3_EL1: return (u64)read_cpuid(ID_ISAR3_EL1); - case SYS_ID_ISAR4_EL1: return (u64)read_cpuid(ID_ISAR4_EL1); - case SYS_ID_ISAR5_EL1: return (u64)read_cpuid(ID_ISAR4_EL1); - case SYS_MVFR0_EL1: return (u64)read_cpuid(MVFR0_EL1); - case SYS_MVFR1_EL1: return (u64)read_cpuid(MVFR1_EL1); - case SYS_MVFR2_EL1: return (u64)read_cpuid(MVFR2_EL1); + case SYS_ID_PFR0_EL1: return read_cpuid(SYS_ID_PFR0_EL1); + case SYS_ID_PFR1_EL1: return read_cpuid(SYS_ID_PFR1_EL1); + case SYS_ID_DFR0_EL1: return read_cpuid(SYS_ID_DFR0_EL1); + case SYS_ID_MMFR0_EL1: return read_cpuid(SYS_ID_MMFR0_EL1); + case SYS_ID_MMFR1_EL1: return read_cpuid(SYS_ID_MMFR1_EL1); + case SYS_ID_MMFR2_EL1: return read_cpuid(SYS_ID_MMFR2_EL1); + case SYS_ID_MMFR3_EL1: return read_cpuid(SYS_ID_MMFR3_EL1); + case SYS_ID_ISAR0_EL1: return read_cpuid(SYS_ID_ISAR0_EL1); + case SYS_ID_ISAR1_EL1: return read_cpuid(SYS_ID_ISAR1_EL1); + case SYS_ID_ISAR2_EL1: return read_cpuid(SYS_ID_ISAR2_EL1); + case SYS_ID_ISAR3_EL1: return read_cpuid(SYS_ID_ISAR3_EL1); + case SYS_ID_ISAR4_EL1: return read_cpuid(SYS_ID_ISAR4_EL1); + case SYS_ID_ISAR5_EL1: return read_cpuid(SYS_ID_ISAR4_EL1); + case SYS_MVFR0_EL1: return read_cpuid(SYS_MVFR0_EL1); + case SYS_MVFR1_EL1: return read_cpuid(SYS_MVFR1_EL1); + case SYS_MVFR2_EL1: return read_cpuid(SYS_MVFR2_EL1); - case SYS_ID_AA64PFR0_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64PFR1_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64DFR0_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64DFR1_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64MMFR0_EL1: return (u64)read_cpuid(ID_AA64MMFR0_EL1); - case SYS_ID_AA64MMFR1_EL1: return (u64)read_cpuid(ID_AA64MMFR1_EL1); - case SYS_ID_AA64ISAR0_EL1: return (u64)read_cpuid(ID_AA64ISAR0_EL1); - case SYS_ID_AA64ISAR1_EL1: return (u64)read_cpuid(ID_AA64ISAR1_EL1); + case SYS_ID_AA64PFR0_EL1: return 
read_cpuid(SYS_ID_AA64PFR0_EL1); + case SYS_ID_AA64PFR1_EL1: return read_cpuid(SYS_ID_AA64PFR0_EL1); + case SYS_ID_AA64DFR0_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); + case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); + case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1); + case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1); + case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1); + case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1); - case SYS_CNTFRQ_EL0: return (u64)read_cpuid(CNTFRQ_EL0); - case SYS_CTR_EL0: return (u64)read_cpuid(CTR_EL0); - case SYS_DCZID_EL0: return (u64)read_cpuid(DCZID_EL0); + case SYS_CNTFRQ_EL0: return read_cpuid(SYS_CNTFRQ_EL0); + case SYS_CTR_EL0: return read_cpuid(SYS_CTR_EL0); + case SYS_DCZID_EL0: return read_cpuid(SYS_DCZID_EL0); default: BUG(); return 0; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 212ae6361d8b..76df22272804 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -201,35 +201,35 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) { info->reg_cntfrq = arch_timer_get_cntfrq(); info->reg_ctr = read_cpuid_cachetype(); - info->reg_dczid = read_cpuid(DCZID_EL0); + info->reg_dczid = read_cpuid(SYS_DCZID_EL0); info->reg_midr = read_cpuid_id(); - info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); - info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); - info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); - info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); - info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); - info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); - info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); - info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); + info->reg_id_aa64dfr0 = read_cpuid(SYS_ID_AA64DFR0_EL1); + info->reg_id_aa64dfr1 = read_cpuid(SYS_ID_AA64DFR1_EL1); + info->reg_id_aa64isar0 = read_cpuid(SYS_ID_AA64ISAR0_EL1); + info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1); + info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1); + info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1); + info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1); + info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1); - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); + info->reg_id_dfr0 = read_cpuid(SYS_ID_DFR0_EL1); + info->reg_id_isar0 = read_cpuid(SYS_ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(SYS_ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(SYS_ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(SYS_ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(SYS_ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(SYS_ID_ISAR5_EL1); + info->reg_id_mmfr0 = read_cpuid(SYS_ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(SYS_ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(SYS_ID_MMFR2_EL1); + info->reg_id_mmfr3 = read_cpuid(SYS_ID_MMFR3_EL1); + info->reg_id_pfr0 = read_cpuid(SYS_ID_PFR0_EL1); + 
info->reg_id_pfr1 = read_cpuid(SYS_ID_PFR1_EL1); - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); + info->reg_mvfr0 = read_cpuid(SYS_MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(SYS_MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(SYS_MVFR2_EL1); cpuinfo_detect_icache_policy(info); diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index e87f53ff5f58..7275628ba59f 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -187,7 +187,7 @@ switch_mm_fastpath: static int asids_init(void) { - int fld = cpuid_feature_extract_field(read_cpuid(ID_AA64MMFR0_EL1), 4); + int fld = cpuid_feature_extract_field(read_cpuid(SYS_ID_AA64MMFR0_EL1), 4); switch (fld) { default: From 8d6623808d9b75778dce5afba7fbbbd387eb4160 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:47 +0000 Subject: [PATCH 0410/3220] UPSTREAM: arm64: add ARMv8.2 id_aa64mmfr2 boiler plate ARMv8.2 adds a new feature register id_aa64mmfr2. This patch adds the cpu feature boiler plate used by the actual features in later patches. Signed-off-by: James Morse Reviewed-by: Suzuki K Poulose Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 406e308770a92bd33995b2e5b681e86358328bb0) Signed-off-by: Jeff Vander Stoep Change-Id: I51db326696ba1e18e9a4c667acbeb3e25e0b151e --- arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/sysreg.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 10 ++++++++++ arch/arm64/kernel/cpuinfo.c | 1 + 4 files changed, 16 insertions(+) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index b5e9cee4b5f8..13a6103130cd 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -36,6 +36,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64isar1; u64 reg_id_aa64mmfr0; u64 reg_id_aa64mmfr1; + u64 reg_id_aa64mmfr2; u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 76907c94b11f..4bc8655529df 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -70,6 +70,7 @@ #define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0) #define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1) +#define SYS_ID_AA64MMFR2_EL1 sys_reg(3, 0, 0, 7, 2) #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0) #define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1) @@ -135,6 +136,9 @@ #define ID_AA64MMFR1_VMIDBITS_SHIFT 4 #define ID_AA64MMFR1_HADBS_SHIFT 0 +/* id_aa64mmfr2 */ +#define ID_AA64MMFR2_UAO_SHIFT 4 + /* id_aa64dfr0 */ #define ID_AA64DFR0_CTX_CMPS_SHIFT 28 #define ID_AA64DFR0_WRPS_SHIFT 20 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 1ef10e784031..42918c797e8e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -123,6 +123,11 @@ static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { ARM64_FTR_END, }; +static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static struct arm64_ftr_bits ftr_ctr[] = { U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 3, 0), @@ -284,6 +289,7 @@ static struct arm64_ftr_reg arm64_ftr_regs[] = { /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1), + ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), /* Op1 = 3, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_CTR_EL0, 
ftr_ctr), @@ -408,6 +414,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); + init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); @@ -517,6 +524,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu, info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, + info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); /* * EL3 is not our concern. @@ -831,6 +840,7 @@ static u64 __raw_read_system_reg(u32 sys_id) case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1); case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1); + case SYS_ID_AA64MMFR2_EL1: return read_cpuid(SYS_ID_AA64MMFR2_EL1); case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1); case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 76df22272804..966fbd52550b 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -210,6 +210,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1); info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1); + info->reg_id_aa64mmfr2 = read_cpuid(SYS_ID_AA64MMFR2_EL1); info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1); From dba95a8428b15b2cff37f5b07e4539ccb82a330e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:48 +0000 Subject: [PATCH 0411/3220] UPSTREAM: arm64: kernel: Add support for User Access Override 'User Access Override' is a new ARMv8.2 feature which allows the unprivileged load and store instructions to be overridden to behave in the normal way. This patch converts {get,put}_user() and friends to use ldtr*/sttr* instructions - so that they can only access EL0 memory, then enables UAO when fs==KERNEL_DS so that these functions can access kernel memory. 
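As a rough model of the resulting set_fs() behaviour (a stand-alone C sketch, not the kernel code; the enum, the pstate_uao flag and model_set_fs() are stand-ins for the real definitions):

    /* With UAO, ldtr/sttr behave as unprivileged accesses unless
     * PSTATE.UAO is set, so switching to KERNEL_DS must enable the
     * override for get_user()/put_user() to reach kernel memory. */
    enum fs_model { MODEL_USER_DS, MODEL_KERNEL_DS };
    static int pstate_uao;                  /* models the PSTATE.UAO bit */

    static void model_set_fs(enum fs_model fs)
    {
            pstate_uao = (fs == MODEL_KERNEL_DS);
    }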
This allows user space's read/write permissions to be checked against the page tables, instead of testing addr<TASK_SIZE, then using the kernel's read/write permissions. [catalin.marinas@arm.com: move uao_thread_switch() above dsb()] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 57f4959bad0a154aeca125b7d38d1d9471a12422) Signed-off-by: Jeff Vander Stoep Change-Id: I1a6a74a1f33b92d54368bd99387b55cf62930903 --- arch/arm64/Kconfig | 21 ++++++++ arch/arm64/include/asm/alternative.h | 72 ++++++++++++++++++++++++++++ arch/arm64/include/asm/cpufeature.h | 3 +- arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 3 ++ arch/arm64/include/asm/thread_info.h | 6 +++ arch/arm64/include/asm/uaccess.h | 44 ++++++++++++----- arch/arm64/include/uapi/asm/ptrace.h | 1 + arch/arm64/kernel/cpufeature.c | 11 +++++ arch/arm64/kernel/process.c | 19 ++++++++ arch/arm64/lib/clear_user.S | 8 ++-- arch/arm64/lib/copy_from_user.S | 8 ++-- arch/arm64/lib/copy_in_user.S | 16 +++---- arch/arm64/lib/copy_to_user.S | 8 ++-- arch/arm64/mm/fault.c | 31 +++++++++--- 15 files changed, 213 insertions(+), 39 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 71ce487d7b78..ae51e1b18337 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -740,6 +740,27 @@ config ARM64_LSE_ATOMICS endmenu +config ARM64_UAO + bool "Enable support for User Access Override (UAO)" + default y + help + User Access Override (UAO; part of the ARMv8.2 Extensions) + causes the 'unprivileged' variant of the load/store instructions to + be overridden to be privileged. + + This option changes get_user() and friends to use the 'unprivileged' + variant of the load/store instructions. This ensures that user-space + really did have access to the supplied memory. When addr_limit is + set to kernel memory the UAO bit will be set, allowing privileged + access to kernel memory. + + Choosing this option will cause copy_to_user() et al to use user-space + memory permissions. + + The feature is detected at runtime; the kernel will use the + regular load/store instructions if the CPU does not implement the + feature. + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index e4962f04201e..a9fc24ec1aa9 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -1,6 +1,8 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H +#include + #ifndef __ASSEMBLY__ #include @@ -63,6 +65,8 @@ void apply_alternatives(void *start, size_t length); #else +#include + .macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len .word \orig_offset - . .word \alt_offset - . @@ -136,6 +140,74 @@ void apply_alternatives(void *start, size_t length); alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) +/* + * Generate the assembly for UAO alternatives with exception table entries. + * This is complicated as there are no post-increment or pair versions of the + * unprivileged instructions, and USER() only works for single instructions.
+ */ +#ifdef CONFIG_ARM64_UAO + .macro uao_ldp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: ldp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + ldtr \reg1, [\addr]; + ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .quad 8889b,\l; + .previous; + .endm + + .macro uao_stp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: stp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + sttr \reg1, [\addr]; + sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .quad 8889b,\l; + .previous + .endm + + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: \inst \reg, [\addr], \post_inc; + nop; + alternative_else + \alt_inst \reg, [\addr]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .previous + .endm +#else + .macro uao_ldp l, reg1, reg2, addr, post_inc + USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_stp l, reg1, reg2, addr, post_inc + USER(\l, stp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + USER(\l, \inst \reg, [\addr], \post_inc) + .endm +#endif + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8131abfabb0a..a5df7cde616b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -31,8 +31,9 @@ #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 +#define ARM64_HAS_UAO 9 -#define ARM64_NCAPS 9 +#define ARM64_NCAPS 10 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5bb1d763d17a..cef1cf398356 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -191,5 +191,6 @@ static inline void spin_lock_prefetch(const void *ptr) #endif void cpu_enable_pan(void *__unused); +void cpu_enable_uao(void *__unused); #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4bc8655529df..b9fd8ec79033 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -77,9 +77,12 @@ #define SYS_DCZID_EL0 sys_reg(3, 3, 0, 0, 7) #define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4) +#define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3) #define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\ (!!x)<<8 | 0x1f) +#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ + (!!x)<<8 | 0x1f) /* SCTLR_EL1 */ #define SCTLR_EL1_CP15BEN (0x1 << 5) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..eba8db6838af 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -85,6 +85,12 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } +/* Access struct thread_info of another thread */ +static inline struct thread_info *get_thread_info(unsigned long thread_stack) +{ + return (struct thread_info *)(thread_stack & ~(THREAD_SIZE - 1)); +} + #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git 
a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 2a44b9795532..a227ba376f23 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -64,6 +64,16 @@ extern int fixup_exception(struct pt_regs *regs); static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; + + /* + * Enable/disable UAO so that copy_to_user() etc can access + * kernel memory with the unprivileged instructions. + */ + if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); } #define segment_eq(a, b) ((a) == (b)) @@ -113,9 +123,10 @@ static inline void set_fs(mm_segment_t fs) * The "__xxx_error" versions set the third argument to -EFAULT if an error * occurs, and leave it unchanged on success. */ -#define __get_user_asm(instr, reg, x, addr, err) \ +#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ - "1: " instr " " reg "1, [%2]\n" \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ + alt_instr " " reg "1, [%2]\n", feature) \ "2:\n" \ " .section .fixup, \"ax\"\n" \ " .align 2\n" \ @@ -138,16 +149,20 @@ do { \ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm("ldrb", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __get_user_asm("ldrh", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __get_user_asm("ldr", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __get_user_asm("ldr", "%", __gu_val, (ptr), (err)); \ + __get_user_asm("ldr", "ldtr", "%", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ @@ -181,9 +196,10 @@ do { \ ((x) = 0, -EFAULT); \ }) -#define __put_user_asm(instr, reg, x, addr, err) \ +#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ - "1: " instr " " reg "1, [%2]\n" \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ + alt_instr " " reg "1, [%2]\n", feature) \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .align 2\n" \ @@ -205,16 +221,20 @@ do { \ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ - __put_user_asm("strb", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __put_user_asm("strh", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __put_user_asm("str", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __put_user_asm("str", "%", __pu_val, (ptr), (err)); \ + __put_user_asm("str", "sttr", "%", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 208db3df135a..b5c3933ed441 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -45,6 +45,7 @@ #define PSR_A_BIT 0x00000100 #define PSR_D_BIT 0x00000200 #define PSR_PAN_BIT 0x00400000 +#define PSR_UAO_BIT 0x00800000 #define PSR_Q_BIT 0x08000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 diff --git 
a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 42918c797e8e..ae22edf9d3c9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -677,6 +677,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NO_HW_PREFETCH, .matches = has_no_hw_prefetch, }, +#ifdef CONFIG_ARM64_UAO + { + .desc = "User Access Override", + .capability = ARM64_HAS_UAO, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .field_pos = ID_AA64MMFR2_UAO_SHIFT, + .min_field_value = 1, + .enable = cpu_enable_uao, + }, +#endif /* CONFIG_ARM64_UAO */ {}, }; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 5f6244526e31..4431e8364881 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -46,6 +46,7 @@ #include #include +#include #include #include #include @@ -346,6 +347,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } else { memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h; + if (IS_ENABLED(CONFIG_ARM64_UAO) && + cpus_have_cap(ARM64_HAS_UAO)) + childregs->pstate |= PSR_UAO_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; } @@ -374,6 +378,20 @@ static void tls_thread_switch(struct task_struct *next) : : "r" (tpidr), "r" (tpidrro)); } +/* Restore the UAO state depending on next's addr_limit */ +static void uao_thread_switch(struct task_struct *next) +{ + unsigned long next_sp = next->thread.cpu_context.sp; + + if (IS_ENABLED(CONFIG_ARM64_UAO) && + get_thread_info(next_sp)->addr_limit == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); +} + /* * Thread switching. 
*/ @@ -386,6 +404,7 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); + uao_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index a9723c71c52b..3f950b677c07 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -39,20 +39,20 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ subs x1, x1, #8 b.mi 2f 1: -USER(9f, str xzr, [x0], #8 ) +uao_user_alternative 9f, str, sttr, xzr, x0, 8 subs x1, x1, #8 b.pl 1b 2: adds x1, x1, #4 b.mi 3f -USER(9f, str wzr, [x0], #4 ) +uao_user_alternative 9f, str, sttr, wzr, x0, 4 sub x1, x1, #4 3: adds x1, x1, #2 b.mi 4f -USER(9f, strh wzr, [x0], #2 ) +uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 sub x1, x1, #2 4: adds x1, x1, #1 b.mi 5f -USER(9f, strb wzr, [x0] ) +uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 281e75db899a..be09fda09986 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -34,7 +34,7 @@ */ .macro ldrb1 ptr, regB, val - USER(9998f, ldrb \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val .endm .macro strb1 ptr, regB, val @@ -42,7 +42,7 @@ .endm .macro ldrh1 ptr, regB, val - USER(9998f, ldrh \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val .endm .macro strh1 ptr, regB, val @@ -50,7 +50,7 @@ .endm .macro ldr1 ptr, regB, val - USER(9998f, ldr \ptr, [\regB], \val) + uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val .endm .macro str1 ptr, regB, val @@ -58,7 +58,7 @@ .endm .macro ldp1 ptr, regB, regC, val - USER(9998f, ldp \ptr, \regB, [\regC], \val) + uao_ldp 9998f, \ptr, \regB, \regC, \val .endm .macro stp1 ptr, regB, regC, val diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 81c8fc93c100..feaad1520dc1 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -35,35 +35,35 @@ * x0 - bytes not copied */ .macro ldrb1 ptr, regB, val - USER(9998f, ldrb \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val .endm .macro strb1 ptr, regB, val - USER(9998f, strb \ptr, [\regB], \val) + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val .endm .macro ldrh1 ptr, regB, val - USER(9998f, ldrh \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val .endm .macro strh1 ptr, regB, val - USER(9998f, strh \ptr, [\regB], \val) + uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val .endm .macro ldr1 ptr, regB, val - USER(9998f, ldr \ptr, [\regB], \val) + uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val .endm .macro str1 ptr, regB, val - USER(9998f, str \ptr, [\regB], \val) + uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val .endm .macro ldp1 ptr, regB, regC, val - USER(9998f, ldp \ptr, \regB, [\regC], \val) + uao_ldp 9998f, \ptr, \regB, \regC, \val .endm .macro stp1 ptr, regB, regC, val - USER(9998f, stp \ptr, \regB, [\regC], \val) + uao_stp 9998f, \ptr, \regB, \regC, \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index db4d187de61f..b4b9178aa36b 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -37,7 +37,7 @@ 
.endm .macro strb1 ptr, regB, val - USER(9998f, strb \ptr, [\regB], \val) + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val .endm .macro ldrh1 ptr, regB, val @@ -45,7 +45,7 @@ .endm .macro strh1 ptr, regB, val - USER(9998f, strh \ptr, [\regB], \val) + uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val .endm .macro ldr1 ptr, regB, val @@ -53,7 +53,7 @@ .endm .macro str1 ptr, regB, val - USER(9998f, str \ptr, [\regB], \val) + uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val .endm .macro ldp1 ptr, regB, regC, val @@ -61,7 +61,7 @@ .endm .macro stp1 ptr, regB, regC, val - USER(9998f, stp \ptr, \regB, [\regC], \val) + uao_stp 9998f, \ptr, \regB, \regC, \val .endm end .req x5 diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 92ddac1e8ca2..820d47353cf0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -192,6 +192,14 @@ out: return fault; } +static inline int permission_fault(unsigned int esr) +{ + unsigned int ec = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT; + unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; + + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -225,12 +233,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - /* - * PAN bit set implies the fault happened in kernel space, but not - * in the arch's user access functions. - */ - if (IS_ENABLED(CONFIG_ARM64_PAN) && (regs->pstate & PSR_PAN_BIT)) - goto no_context; + if (permission_fault(esr) && (addr < USER_DS)) { + if (!search_exception_tables(regs->pc)) + panic("Accessing user space memory outside uaccess.h routines"); + } /* * As per x86, we may deadlock here. However, since the kernel only @@ -561,3 +567,16 @@ void cpu_enable_pan(void *__unused) config_sctlr_el1(SCTLR_EL1_SPAN, 0); } #endif /* CONFIG_ARM64_PAN */ + +#ifdef CONFIG_ARM64_UAO +/* + * Kernel threads have fs=KERNEL_DS by default, and don't need to call + * set_fs(), devtmpfs in particular relies on this behaviour. + * We need to enable the feature at runtime (instead of adding it to + * PSR_MODE_EL1h) as the feature may not be implemented by the cpu. + */ +void cpu_enable_uao(void *__unused) +{ + asm(SET_PSTATE_UAO(1)); +} +#endif /* CONFIG_ARM64_UAO */ From f7a32945ec495b6b88c251bd240b0d54f6636f06 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:49 +0000 Subject: [PATCH 0412/3220] UPSTREAM: arm64: cpufeature: Test 'matches' pointer to find the end of the list CPU feature code uses the desc field as a test to find the end of the list, this means every entry must have a description. This generates noise for entries in the list that aren't really features, but combinations of them. e.g. > CPU features: detected feature: Privileged Access Never > CPU features: detected feature: PAN and not UAO These combination features are needed for corner cases with alternatives, where cpu features interact. Change all walkers of the arm64_features[] and arm64_hwcaps[] lists to test 'matches' not 'desc', and only print 'desc' if it is non-NULL. 
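The new termination rule is easy to model in plain C; this self-contained sketch uses simplified stand-ins for struct arm64_cpu_capabilities and the real list walkers:

    #include <stdio.h>

    struct cap {
            const char *desc;               /* may be NULL for composite features */
            int (*matches)(const struct cap *);
    };

    static int always(const struct cap *c) { return 1; }

    static void update_caps(const struct cap *caps, const char *info)
    {
            for (int i = 0; caps[i].matches; i++) {  /* test 'matches', not 'desc' */
                    if (!caps[i].matches(&caps[i]))
                            continue;
                    if (caps[i].desc)       /* only named features are printed */
                            printf("%s %s\n", info, caps[i].desc);
            }
    }

    int main(void)
    {
            const struct cap caps[] = {
                    { "Privileged Access Never", always },
                    { NULL, always },       /* composite feature: detected, never printed */
                    { NULL, NULL },         /* list terminator */
            };
            update_caps(caps, "CPU features: detected feature:");
            return 0;
    }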
Signed-off-by: James Morse Reviewed-by : Suzuki K Poulose Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 644c2ae198412c956700e55a2acf80b2541f6aa5) Signed-off-by: Jeff Vander Stoep Change-Id: I4500bb7c547e2e67ea56e242a8621df539f6fd67 --- arch/arm64/kernel/cpufeature.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ae22edf9d3c9..9cc8186cd14b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -771,7 +771,7 @@ static void __init setup_cpu_hwcaps(void) int i; const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; - for (i = 0; hwcaps[i].desc; i++) + for (i = 0; hwcaps[i].matches; i++) if (hwcaps[i].matches(&hwcaps[i])) cap_set_hwcap(&hwcaps[i]); } @@ -781,11 +781,11 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, { int i; - for (i = 0; caps[i].desc; i++) { + for (i = 0; caps[i].matches; i++) { if (!caps[i].matches(&caps[i])) continue; - if (!cpus_have_cap(caps[i].capability)) + if (!cpus_have_cap(caps[i].capability) && caps[i].desc) pr_info("%s %s\n", info, caps[i].desc); cpus_set_cap(caps[i].capability); } @@ -800,7 +800,7 @@ enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { int i; - for (i = 0; caps[i].desc; i++) + for (i = 0; caps[i].matches; i++) if (caps[i].enable && cpus_have_cap(caps[i].capability)) on_each_cpu(caps[i].enable, NULL, true); } @@ -907,7 +907,7 @@ void verify_local_cpu_capabilities(void) return; caps = arm64_features; - for (i = 0; caps[i].desc; i++) { + for (i = 0; caps[i].matches; i++) { if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg) continue; /* @@ -920,7 +920,7 @@ void verify_local_cpu_capabilities(void) caps[i].enable(NULL); } - for (i = 0, caps = arm64_hwcaps; caps[i].desc; i++) { + for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) { if (!cpus_have_hwcap(&caps[i])) continue; if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) From 612055f383a023e12042f75d6d951e76ee4c5748 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:50 +0000 Subject: [PATCH 0413/3220] BACKPORT: arm64: kernel: Don't toggle PAN on systems with UAO If a CPU supports both Privileged Access Never (PAN) and User Access Override (UAO), we don't need to disable/re-enable PAN round all copy_to_user() like calls. UAO alternatives cause these calls to use the 'unprivileged' load/store instructions, which are overridden to be the privileged kind when fs==KERNEL_DS. This patch changes the copy_to_user() calls to have their PAN toggling depend on a new composite 'feature' ARM64_ALT_PAN_NOT_UAO. If both features are detected, PAN will be enabled, but the copy_to_user() alternatives will not be applied. This means PAN will be enabled all the time for these functions. If only PAN is detected, the toggling will be enabled as normal. This will save the time taken to disable/re-enable PAN, and allow us to catch copy_to_user() accesses that occur with fs==KERNEL_DS. Futex and swp-emulation code continue to hang their PAN toggling code on ARM64_HAS_PAN. 
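The composite capability itself reduces to a predicate over the two detected features; sketched here with hypothetical have_pan/have_uao flags standing in for the cpufeature API:

    static int have_pan, have_uao;          /* filled in by CPU feature detection */

    /* PAN toggling around copy_to_user() et al is only needed when the
     * CPU implements PAN but not UAO. */
    static int alt_pan_not_uao(void)
    {
            return have_pan && !have_uao;
    }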
Signed-off-by: James Morse Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 705441960033e66b63524521f153fbb28c99ddbd) Signed-off-by: Jeff Vander Stoep Change-Id: I3fa35ebacaf401e1344e76932a26fdd14a8a3cdb --- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/uaccess.h | 8 ++++---- arch/arm64/kernel/cpufeature.c | 16 ++++++++++++++++ arch/arm64/lib/clear_user.S | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_in_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- arch/arm64/mm/fault.c | 3 +++ 8 files changed, 33 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a5df7cde616b..37a53fc6b384 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -32,8 +32,9 @@ #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 #define ARM64_HAS_UAO 9 +#define ARM64_ALT_PAN_NOT_UAO 10 -#define ARM64_NCAPS 10 +#define ARM64_NCAPS 11 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index a227ba376f23..7b07e73b6316 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -145,7 +145,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -168,7 +168,7 @@ do { \ BUILD_BUG(); \ } \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ } while (0) @@ -217,7 +217,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -239,7 +239,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ } while (0) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9cc8186cd14b..7566cad9fa1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -67,6 +67,10 @@ DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); .width = 0, \ } +/* meta feature for alternatives */ +static bool __maybe_unused +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry); + static struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0), ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0), @@ -688,6 +692,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .enable = cpu_enable_uao, }, #endif /* CONFIG_ARM64_UAO */ +#ifdef CONFIG_ARM64_PAN + { + .capability = ARM64_ALT_PAN_NOT_UAO, + .matches = cpufeature_pan_not_uao, + }, +#endif /* CONFIG_ARM64_PAN */ {}, }; @@ -966,3 +976,9 @@ void __init setup_cpu_features(void) pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", L1_CACHE_BYTES, cls); } + +static bool __maybe_unused +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry) +{ + return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); +} diff --git 
a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 3f950b677c07..5d1cad3ce6d6 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -33,7 +33,7 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x2, x1 // save the size for fixup return subs x1, x1, #8 @@ -54,7 +54,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index be09fda09986..0b90497d4424 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -67,11 +67,11 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index feaad1520dc1..f7292dd08c84 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -68,11 +68,11 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index b4b9178aa36b..7a7efe255034 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -66,11 +66,11 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 820d47353cf0..d0762a729d01 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -234,6 +234,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (permission_fault(esr) && (addr < USER_DS)) { + if (get_thread_info(regs->sp)->addr_limit == KERNEL_DS) + panic("Accessing user space memory with fs=KERNEL_DS"); + if (!search_exception_tables(regs->pc)) panic("Accessing user space memory outside uaccess.h routines"); } From c39e2026d6c2daef475d66cc0656769f93b42fc4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 18 Feb 2016 15:50:04 +0000 Subject: [PATCH 0414/3220] UPSTREAM: arm64: Remove the get_thread_info() function This function was introduced by previous commits implementing UAO. 
However, it can be replaced with task_thread_info() in uao_thread_switch() or get_fs() in do_page_fault() (the latter being called only on the current context, so no need for using the saved pt_regs). Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit e950631e84e7e38892ffbeee5e1816b270026b0e) Signed-off-by: Jeff Vander Stoep Change-Id: Ic6e9b6af7314fa83d9b0773ae3fac5a2ff34e67a --- arch/arm64/include/asm/thread_info.h | 6 ------ arch/arm64/kernel/process.c | 15 ++++++--------- arch/arm64/mm/fault.c | 2 +- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index eba8db6838af..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -85,12 +85,6 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } -/* Access struct thread_info of another thread */ -static inline struct thread_info *get_thread_info(unsigned long thread_stack) -{ - return (struct thread_info *)(thread_stack & ~(THREAD_SIZE - 1)); -} - #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4431e8364881..6f3fb46170bf 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -381,15 +381,12 @@ static void tls_thread_switch(struct task_struct *next) /* Restore the UAO state depending on next's addr_limit */ static void uao_thread_switch(struct task_struct *next) { - unsigned long next_sp = next->thread.cpu_context.sp; - - if (IS_ENABLED(CONFIG_ARM64_UAO) && - get_thread_info(next_sp)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); + if (IS_ENABLED(CONFIG_ARM64_UAO)) { + if (task_thread_info(next)->addr_limit == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); + } } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d0762a729d01..a8eafeceb08a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -234,7 +234,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (permission_fault(esr) && (addr < USER_DS)) { - if (get_thread_info(regs->sp)->addr_limit == KERNEL_DS) + if (get_fs() == KERNEL_DS) panic("Accessing user space memory with fs=KERNEL_DS"); if (!search_exception_tables(regs->pc)) From 6bfaecfc5801909762309f4ec831f789581f279b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:32 +0100 Subject: [PATCH 0415/3220] UPSTREAM: of/fdt: make memblock minimum physical address arch configurable By default, early_init_dt_add_memory_arch() ignores memory below the base of the kernel image since it won't be addressable via the linear mapping. However, this is not appropriate anymore once we decouple the kernel text mapping from the linear mapping, so archs may want to drop the low limit entirely. So allow the minimum to be overridden by setting MIN_MEMBLOCK_ADDR. 
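A sketch of how an architecture could use the new hook; the header placement is hypothetical, only the MIN_MEMBLOCK_ADDR name comes from this patch:

    /* in an arch-specific header seen before the fallback definition:
     * accept all of physical memory, even below the kernel image */
    #define MIN_MEMBLOCK_ADDR 0

Architectures that do nothing keep the old behaviour, since the fallback remains __pa(PAGE_OFFSET).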
Acked-by: Mark Rutland Acked-by: Rob Herring Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 270522a04f7a9911983878fa37da467f9ff1c938) Signed-off-by: Jeff Vander Stoep Change-Id: I4bb2626a87493262a64584b3d808de260129127e --- drivers/of/fdt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 86fff625f6d5..628f436e16f6 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1006,13 +1006,16 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, } #ifdef CONFIG_HAVE_MEMBLOCK +#ifndef MIN_MEMBLOCK_ADDR +#define MIN_MEMBLOCK_ADDR __pa(PAGE_OFFSET) +#endif #ifndef MAX_MEMBLOCK_ADDR #define MAX_MEMBLOCK_ADDR ((phys_addr_t)~0) #endif void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) { - const u64 phys_offset = __pa(PAGE_OFFSET); + const u64 phys_offset = MIN_MEMBLOCK_ADDR; if (!PAGE_ALIGNED(base)) { if (size < PAGE_SIZE - (base & ~PAGE_MASK)) { From 6f77d1a3662212ea72e5c37c381bf0d2a09a463e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:33 +0100 Subject: [PATCH 0416/3220] UPSTREAM: of/fdt: factor out assignment of initrd_start/initrd_end Since architectures may not yet have their linear mapping up and running when the initrd address is discovered from the DT, factor out the assignment of initrd_start and initrd_end, so that an architecture can override it and use the translation it needs. Signed-off-by: Ard Biesheuvel Acked-by: Rob Herring Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 369bc9abf22bf026e8645a4dd746b90649a2f6ee) Signed-off-by: Jeff Vander Stoep Change-Id: I8c258bdc4367955314e9a5223dc4c7751a06a98d --- drivers/of/fdt.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 628f436e16f6..c6d196188bc9 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -760,6 +760,16 @@ const void * __init of_flat_dt_match_machine(const void *default_match, } #ifdef CONFIG_BLK_DEV_INITRD +#ifndef __early_init_dt_declare_initrd +static void __early_init_dt_declare_initrd(unsigned long start, + unsigned long end) +{ + initrd_start = (unsigned long)__va(start); + initrd_end = (unsigned long)__va(end); + initrd_below_start_ok = 1; +} +#endif + /** * early_init_dt_check_for_initrd - Decode initrd location from flat tree * @node: reference to node containing initrd location ('chosen') @@ -782,9 +792,7 @@ static void __init early_init_dt_check_for_initrd(unsigned long node) return; end = of_read_number(prop, len/4); - initrd_start = (unsigned long)__va(start); - initrd_end = (unsigned long)__va(end); - initrd_below_start_ok = 1; + __early_init_dt_declare_initrd(start, end); pr_debug("initrd_start=0x%llx initrd_end=0x%llx\n", (unsigned long long)start, (unsigned long long)end); From 6727d505317c6032c58063c46fe317045f3dacb4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:34 +0100 Subject: [PATCH 0417/3220] BACKPORT: arm64: prevent potential circular header dependencies in asm/bug.h Currently, using BUG_ON() in header files is cumbersome, due to the fact that asm/bug.h transitively includes a lot of other header files, resulting in the actual BUG_ON() invocation appearing before its definition in the preprocessor input. 
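A hypothetical illustration of the failure mode (the header name foo.h is made up):

    /* foo.h -- pulled in, transitively, by asm/bug.h's own includes */
    static inline void sanity_check(int x)
    {
            /* At this point in the preprocessor input BUG_ON() has not
             * been defined yet, so this compiles as a call to an
             * undeclared function and the build breaks. */
            BUG_ON(x < 0);
    }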
So let's reverse the #include dependency between asm/bug.h and asm/debug-monitors.h, by moving the definition of BUG_BRK_IMM from the latter to the former. Also fix up one user of asm/debug-monitors.h which relied on a transitive include. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 03336b1df9929e5d9c28fd9768948b6151cb046c) Signed-off-by: Jeff Vander Stoep Change-Id: I71470b7db9ef858a5a8368a872f931936c723a25 --- arch/arm64/include/asm/bug.h | 2 +- arch/arm64/include/asm/debug-monitors.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 4a748ce9ba1a..679d49221998 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,7 +18,7 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#include +#define BUG_BRK_IMM 0x800 #ifdef CONFIG_GENERIC_BUG #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 279c85b5ec09..e893a1fca9c2 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,6 @@ #define FAULT_BRK_IMM 0x100 #define KGDB_DYN_DBG_BRK_IMM 0x400 #define KGDB_COMPILED_DBG_BRK_IMM 0x401 -#define BUG_BRK_IMM 0x800 /* * BRK instruction encoding From e3ec18326ffe69b6736232499ddf8634ed40601c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:35 +0100 Subject: [PATCH 0418/3220] BACKPORT: arm64: add support for ioremap() block mappings This wires up the existing generic huge-vmap feature, which allows ioremap() to use PMD or PUD sized block mappings. It also adds support to the unmap path for dealing with block mappings, which will allow us to unmap the __init region using unmap_kernel_range() in a subsequent patch. 
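The effect is easiest to see as a block-size decision; a stand-alone model with illustrative 4K-granule sizes (map_step() is a model, not a kernel function):

    #include <stdint.h>

    #define PMD_BLOCK (2ULL << 20)          /* 2 MiB block, 4K granule */
    #define PUD_BLOCK (1ULL << 30)          /* 1 GiB block, 4K granule */

    /* Largest block usable for one mapping step: physical address,
     * virtual address and remaining size must all be block-aligned. */
    static uint64_t map_step(uint64_t phys, uint64_t virt, uint64_t size)
    {
            if (!((phys | virt) & (PUD_BLOCK - 1)) && size >= PUD_BLOCK)
                    return PUD_BLOCK;
            if (!((phys | virt) & (PMD_BLOCK - 1)) && size >= PMD_BLOCK)
                    return PMD_BLOCK;
            return 4096;                    /* fall back to page mappings */
    }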
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 324420bf91f60582bb481133db9547111768ef17) Signed-off-by: Jeff Vander Stoep Change-Id: I4765ae77f7d67c3972b7e5b19d43db434e8b777c --- .../features/vm/huge-vmap/arch-support.txt | 2 +- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/memory.h | 6 +++ arch/arm64/mm/mmu.c | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index af6816bccb43..df1d1f3c9af2 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | TODO | | arc: | TODO | | arm: | TODO | - | arm64: | TODO | + | arm64: | ok | | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ae51e1b18337..14866e975277 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -50,6 +50,7 @@ config ARM64 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE select HAVE_ARCH_HARDENED_USERCOPY + select HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 853953cd1f08..c65aad7b13dc 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -100,6 +100,12 @@ #define MT_S2_NORMAL 0xf #define MT_S2_DEVICE_nGnRE 0x1 +#ifdef CONFIG_ARM64_4K_PAGES +#define IOREMAP_MAX_ORDER (PUD_SHIFT) +#else +#define IOREMAP_MAX_ORDER (PMD_SHIFT) +#endif + #ifndef __ASSEMBLY__ extern phys_addr_t memstart_addr; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8d0e49d2cba9..1f4f4d21457f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -745,3 +745,44 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; } + +int __init arch_ioremap_pud_supported(void) +{ + /* only 4k granule supports level 1 block mappings */ + return IS_ENABLED(CONFIG_ARM64_4K_PAGES); +} + +int __init arch_ioremap_pmd_supported(void) +{ + return 1; +} + +int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) +{ + BUG_ON(phys & ~PUD_MASK); + set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) +{ + BUG_ON(phys & ~PMD_MASK); + set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (!pud_sect(*pud)) + return 0; + pud_clear(pud); + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (!pmd_sect(*pmd)) + return 0; + pmd_clear(pmd); + return 1; +} From c8d7bc708e8d0c197c3a0e3197bfb7f283ef352a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:36 +0100 Subject: [PATCH 0419/3220] UPSTREAM: arm64: introduce KIMAGE_VADDR as the virtual base of the kernel region This introduces the preprocessor symbol KIMAGE_VADDR which will serve as the symbolic virtual base of the kernel region, i.e., the kernel's virtual offset will be KIMAGE_VADDR + TEXT_OFFSET. For now, we define it as being equal to PAGE_OFFSET, but in the future, it will be moved below it once we move the kernel virtual mapping out of the linear mapping. 
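The two-range translation is easy to model in user space; the constants below are illustrative (VA_BITS=39-style values) and phys_offset stands in for PHYS_OFFSET:

    #include <stdint.h>

    #define PAGE_OFFSET_M  0xffffffc000000000ULL   /* example linear-map base */
    #define KIMAGE_VADDR_M PAGE_OFFSET_M           /* still equal, per this patch */

    static uint64_t phys_offset;

    static uint64_t virt_to_phys_model(uint64_t x)
    {
            /* linear-map addresses translate via PAGE_OFFSET; anything
             * below it is treated as a kernel-image address */
            return x >= PAGE_OFFSET_M ? x - PAGE_OFFSET_M + phys_offset
                                      : x - KIMAGE_VADDR_M + phys_offset;
    }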
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit ab893fb9f1b17f02139bce547bb4b69e96b9ae16) Signed-off-by: Jeff Vander Stoep Change-Id: I31427bd2b948a22bb8ce1d22109682fc66efb98d --- arch/arm64/include/asm/memory.h | 10 ++++++++-- arch/arm64/kernel/head.S | 2 +- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index c65aad7b13dc..aebc739f5a11 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -51,7 +51,8 @@ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) -#define MODULES_END (PAGE_OFFSET) +#define KIMAGE_VADDR (PAGE_OFFSET) +#define MODULES_END (KIMAGE_VADDR) #define MODULES_VADDR (MODULES_END - SZ_64M) #define PCI_IO_END (MODULES_VADDR - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) @@ -75,8 +76,13 @@ * private definitions which should NOT be used outside memory.h * files. Use virt_to_phys/phys_to_virt/__pa/__va instead. */ -#define __virt_to_phys(x) (((phys_addr_t)(x) - PAGE_OFFSET + PHYS_OFFSET)) +#define __virt_to_phys(x) ({ \ + phys_addr_t __x = (phys_addr_t)(x); \ + __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ + (__x - KIMAGE_VADDR + PHYS_OFFSET); }) + #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) +#define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) /* * Convert a page to/from a physical address diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 53b9f9f128c2..04d38a058b19 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -389,7 +389,7 @@ __create_page_tables: * Map the kernel image (starting with PHYS_OFFSET). */ mov x0, x26 // swapper_pg_dir - mov x5, #PAGE_OFFSET + ldr x5, =KIMAGE_VADDR create_pgd_entry x0, x5, x3, x6 ldr x6, =KERNEL_END // __va(KERNEL_END) mov x3, x24 // phys offset diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 763e5aab5752..7703feba23d6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -89,7 +89,7 @@ SECTIONS *(.discard.*) } - . = PAGE_OFFSET + TEXT_OFFSET; + . = KIMAGE_VADDR + TEXT_OFFSET; .head.text : { _text = .; @@ -188,4 +188,4 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ -ASSERT(_text == (PAGE_OFFSET + TEXT_OFFSET), "HEAD is misaligned") +ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned") From 58539890b74ae227da611e91965c2e5ee2f57645 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:37 +0100 Subject: [PATCH 0420/3220] UPSTREAM: arm64: pgtable: implement static [pte|pmd|pud]_offset variants The page table accessors pte_offset(), pud_offset() and pmd_offset() rely on __va translations, so they can only be used after the linear mapping has been installed. For the early fixmap and kasan init routines, whose page tables are allocated statically in the kernel image, these functions will return bogus values. So implement pte_offset_kimg(), pmd_offset_kimg() and pud_offset_kimg(), which can be used instead before any page tables have been allocated dynamically. 
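The difference between the two translations, shown side by side in a stand-alone model (the _M constants are illustrative stand-ins, not kernel values):

    #include <stdint.h>

    #define PHYS_OFFSET_M  0x80000000ULL           /* illustrative RAM base */
    #define PAGE_OFFSET_M  0xffffffc000000000ULL
    #define KIMAGE_VADDR_M 0xffffffc000000000ULL

    /* __va(): only meaningful once the linear mapping is installed */
    static uint64_t va_model(uint64_t phys)
    { return phys - PHYS_OFFSET_M + PAGE_OFFSET_M; }

    /* the _kimg variants: valid for tables that live inside the image */
    static uint64_t kimg_model(uint64_t phys)
    { return phys - PHYS_OFFSET_M + KIMAGE_VADDR_M; }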
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6533945a32c762c5db70d7a3ec251a040b2d9661) Signed-off-by: Jeff Vander Stoep Change-Id: Ibea400f0938db568524fb83eb2d22d8658bbb56b --- arch/arm64/include/asm/pgtable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6a58dd48663f..f3e887d5376c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -453,6 +453,9 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. @@ -496,6 +499,9 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) + #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) @@ -505,6 +511,8 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) #define pmd_clear_fixmap() +#define pmd_offset_kimg(dir,addr) ((pmd_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -543,6 +551,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) + #else #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) @@ -552,6 +563,8 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) #define pud_clear_fixmap() +#define pud_offset_kimg(dir,addr) ((pud_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) From 3b177382c499864e0fd6a6a11236a616a7105be5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:38 +0100 Subject: [PATCH 0421/3220] UPSTREAM: arm64: decouple early fixmap init from linear mapping Since the early fixmap page tables are populated using pages that are part of the static footprint of the kernel, they are covered by the initial kernel mapping, and we can refer to them without using __va/__pa translations, which are tied to the linear mapping. Since the fixmap page tables are disjoint from the kernel mapping up to the top level pgd entry, we can refer to bm_pte[] directly, and there is no need to walk the page tables and perform __pa()/__va() translations at each step. 
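Referring to bm_pte[] directly works because locating a PTE slot is pure index arithmetic; modeled stand-alone for 4K pages (the _m names are stand-ins):

    #define PAGE_SHIFT_M   12
    #define PTRS_PER_PTE_M 512              /* PTEs per table, 4K granule */

    /* slot of a virtual address within one PTE table */
    #define pte_index_m(addr) (((addr) >> PAGE_SHIFT_M) & (PTRS_PER_PTE_M - 1))

    /* fixmap_pte(addr) then reduces to &bm_pte[pte_index(addr)], with no
     * __pa()/__va() round trip through a possibly-absent linear map. */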
Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 157962f5a8f236cab898b68bdaa69ce68922f0bf) Signed-off-by: Jeff Vander Stoep Change-Id: I49221a199962aec6d4f3712bfb3dd041d64ba99b --- arch/arm64/mm/mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1f4f4d21457f..84ef01d7aaf8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -614,7 +614,7 @@ static inline pud_t * fixmap_pud(unsigned long addr) BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd)); - return pud_offset(pgd, addr); + return pud_offset_kimg(pgd, addr); } static inline pmd_t * fixmap_pmd(unsigned long addr) @@ -623,16 +623,12 @@ static inline pmd_t * fixmap_pmd(unsigned long addr) BUG_ON(pud_none(*pud) || pud_bad(*pud)); - return pmd_offset(pud, addr); + return pmd_offset_kimg(pud, addr); } static inline pte_t * fixmap_pte(unsigned long addr) { - pmd_t *pmd = fixmap_pmd(addr); - - BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd)); - - return pte_offset_kernel(pmd, addr); + return &bm_pte[pte_index(addr)]; } void __init early_fixmap_init(void) @@ -644,14 +640,14 @@ void __init early_fixmap_init(void) pgd = pgd_offset_k(addr); pgd_populate(&init_mm, pgd, bm_pud); - pud = pud_offset(pgd, addr); + pud = fixmap_pud(addr); pud_populate(&init_mm, pud, bm_pmd); - pmd = pmd_offset(pud, addr); + pmd = fixmap_pmd(addr); pmd_populate_kernel(&init_mm, pmd, bm_pte); /* * The boot-ioremap range spans multiple pmds, for which - * we are not preparted: + * we are not prepared: */ BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); From 1ca6787a7310da94e9bf0846b0a2e1a9e9a91291 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:39 +0100 Subject: [PATCH 0422/3220] BACKPORT: arm64: kvm: deal with kernel symbols outside of linear mapping KVM on arm64 uses a fixed offset between the linear mapping at EL1 and the HYP mapping at EL2. Before we can move the kernel virtual mapping out of the linear mapping, we have to make sure that references to kernel symbols that are accessed via the HYP mapping are translated to their linear equivalent. 
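The rebasing that kvm_ksym_ref() performs is a single addition; a sketch with the shift passed in explicitly (the function name is a model, not the kernel API):

    #include <stdint.h>

    /* rebase a kernel-image address into its linear-map alias;
     * shift == PAGE_OFFSET - KIMAGE_VADDR, as in the patch */
    static void *ksym_ref_model(void *sym, int64_t shift)
    {
            return (char *)sym + shift;
    }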
Reviewed-by: Mark Rutland Acked-by: Marc Zyngier Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a0bf9776cd0be4490d4675d4108e13379849fc7f) Signed-off-by: Jeff Vander Stoep Change-Id: I316f029d22a16773c168a151dba59bed7921fa7e --- arch/arm/include/asm/kvm_asm.h | 2 ++ arch/arm/kvm/arm.c | 5 +++-- arch/arm64/include/asm/kvm_asm.h | 17 +++++++++++++++++ arch/arm64/include/asm/kvm_host.h | 8 +++++--- arch/arm64/kvm/hyp.S | 6 +++--- 5 files changed, 30 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 194c91b610ff..c35c349da069 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -79,6 +79,8 @@ #define rr_lo_hi(a1, a2) a1, a2 #endif +#define kvm_ksym_ref(kva) (kva) + #ifndef __ASSEMBLY__ struct kvm; struct kvm_vcpu; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e06fd299de08..70e6d557c75f 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -969,7 +969,7 @@ static void cpu_init_hyp_mode(void *dummy) pgd_ptr = kvm_mmu_get_httbr(); stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; - vector_ptr = (unsigned long)__kvm_hyp_vector; + vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); @@ -1061,7 +1061,8 @@ static int init_hyp_mode(void) /* * Map the Hyp-code called directly from the host */ - err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end); + err = create_hyp_mappings(kvm_ksym_ref(__kvm_hyp_code_start), + kvm_ksym_ref(__kvm_hyp_code_end)); if (err) { kvm_err("Cannot map world-switch code\n"); goto out_free_mappings; diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 5e377101f919..e95c39543629 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,7 +102,24 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) +#define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) + #ifndef __ASSEMBLY__ +#if __GNUC__ > 4 +#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) +#else +/* + * GCC versions 4.9 and older will fold the constant below into the addend of + * the reference to 'sym' above if kvm_ksym_shift is declared static or if the + * constant is used directly. However, since we use the small code model for + * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, + * with a +/- 4 GB range, resulting in linker relocation errors if the shift + * is sufficiently large. So prevent the compiler from folding the shift into + * the addend, by making the shift a variable with external linkage. 
+ */ +__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; +#endif + struct kvm; struct kvm_vcpu; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a35ce7266aac..90c6368ad7c8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -222,7 +222,7 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); -u64 kvm_call_hyp(void *hypfn, ...); +u64 __kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); @@ -243,8 +243,8 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, * Call initialization code, and switch to the full blown * HYP code. */ - kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, - hyp_stack_ptr, vector_ptr); + __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, + hyp_stack_ptr, vector_ptr); } static inline void kvm_arch_hardware_disable(void) {} @@ -258,4 +258,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 86c289832272..309e3479dc2c 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -923,7 +923,7 @@ __hyp_panic_str: .align 2 /* - * u64 kvm_call_hyp(void *hypfn, ...); + * u64 __kvm_call_hyp(void *hypfn, ...); * * This is not really a variadic function in the classic C-way and care must * be taken when calling this to ensure parameters are passed in registers @@ -940,10 +940,10 @@ __hyp_panic_str: * used to implement __hyp_get_vectors in the same way as in * arch/arm64/kernel/hyp_stub.S. */ -ENTRY(kvm_call_hyp) hvc #0 ret -ENDPROC(kvm_call_hyp) +ENTRY(__kvm_call_hyp) hvc #0 ret +ENDPROC(__kvm_call_hyp) .macro invalid_vector label, target .align 2 From c3a32b2934bd3190c651fb53ad734b296a45f15e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:40 +0100 Subject: [PATCH 0423/3220] UPSTREAM: arm64: move kernel image to base of vmalloc area This moves the module area to right before the vmalloc area, and moves the kernel image to the base of the vmalloc area. This is an intermediate step towards implementing KASLR, which allows the kernel image to be located anywhere in the vmalloc area. Since other subsystems such as hibernate may still need to refer to the kernel text or data segments via their linear addresses, both are mapped in the linear region as well. The linear alias of the text region is mapped read-only/non-executable to prevent inadvertent modification or execution.
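[Editor's note: not part of the patch — a quick stand-alone calculation of the layout the new macro definitions produce, assuming VA_BITS=48; the WITH_KASAN flag is an invention of this sketch (build with -DWITH_KASAN for the KASAN variant). The program only mirrors the arithmetic of the definitions in the diff below.]

  #include <stdio.h>

  #define VA_BITS           48
  #define SZ_64M            0x04000000ULL
  #define VA_START          (~0ULL << VA_BITS)
  #define PAGE_OFFSET       (~0ULL << (VA_BITS - 1))
  #ifdef WITH_KASAN
  #define KASAN_SHADOW_SIZE (1ULL << (VA_BITS - 3))
  #else
  #define KASAN_SHADOW_SIZE 0ULL
  #endif
  #define MODULES_VADDR     (VA_START + KASAN_SHADOW_SIZE)
  #define MODULES_END       (MODULES_VADDR + SZ_64M)
  #define KIMAGE_VADDR      (MODULES_END)
  #define VMALLOC_START     (MODULES_END)

  int main(void)
  {
          printf("VA_START      = 0x%016llx\n", VA_START);
          printf("MODULES_VADDR = 0x%016llx\n", MODULES_VADDR);
          printf("MODULES_END   = 0x%016llx\n", MODULES_END);
          printf("KIMAGE_VADDR  = 0x%016llx\n", KIMAGE_VADDR);
          printf("VMALLOC_START = 0x%016llx\n", VMALLOC_START);
          printf("PAGE_OFFSET   = 0x%016llx\n", PAGE_OFFSET);
          return 0;
  }

With 4 KB pages and no KASAN this prints a 64 MB module window at the very bottom of the kernel VA space, with the kernel image and the vmalloc area starting at its end and the linear mapping at 0xffff800000000000, matching the layout the message describes.
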
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f9040773b7bbbd9e98eb6184a263512a7cfc133f) Signed-off-by: Jeff Vander Stoep Change-Id: I698faed47bb7cfc256a1b5b5407a7c586bdc63b3 --- arch/arm64/include/asm/kasan.h | 2 +- arch/arm64/include/asm/memory.h | 21 ++++-- arch/arm64/include/asm/pgtable.h | 10 +-- arch/arm64/mm/dump.c | 12 ++-- arch/arm64/mm/init.c | 23 +++---- arch/arm64/mm/kasan_init.c | 27 +++++++- arch/arm64/mm/mmu.c | 110 +++++++++++++++++++++---------- 7 files changed, 137 insertions(+), 68 deletions(-) diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index de0d21211c34..71ad0f93eb71 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -14,7 +14,7 @@ * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses. */ #define KASAN_SHADOW_START (VA_START) -#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1UL << (VA_BITS - 3))) +#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) /* * This value is used to map an address to the corresponding shadow diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index aebc739f5a11..4388651d1f0d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -45,16 +45,15 @@ * VA_START - the first kernel virtual address. * TASK_SIZE - the maximum size of a user space task. * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. - * The module space lives between the addresses given by TASK_SIZE - * and PAGE_OFFSET - it must be within 128MB of the kernel text. */ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) -#define KIMAGE_VADDR (PAGE_OFFSET) -#define MODULES_END (KIMAGE_VADDR) -#define MODULES_VADDR (MODULES_END - SZ_64M) -#define PCI_IO_END (MODULES_VADDR - SZ_2M) +#define KIMAGE_VADDR (MODULES_END) +#define MODULES_END (MODULES_VADDR + MODULES_VSIZE) +#define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) +#define MODULES_VSIZE (SZ_64M) +#define PCI_IO_END (PAGE_OFFSET - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) #define TASK_SIZE_64 (UL(1) << VA_BITS) @@ -71,6 +70,16 @@ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) +/* + * The size of the KASAN shadow region. This should be 1/8th of the + * size of the entire kernel virtual address space. + */ +#ifdef CONFIG_KASAN +#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) +#else +#define KASAN_SHADOW_SIZE (0) +#endif + /* * Physical vs virtual RAM address space conversion. These are * private definitions which should NOT be used outside memory.h diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f3e887d5376c..9bf72b72b76d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -36,19 +36,13 @@ * * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array * (rounded up to PUD_SIZE). 
- * VMALLOC_START: beginning of the kernel VA space + * VMALLOC_START: beginning of the kernel vmalloc space * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ #define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) -#ifndef CONFIG_KASAN -#define VMALLOC_START (VA_START) -#else -#include -#define VMALLOC_START (KASAN_SHADOW_END + SZ_64K) -#endif - +#define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 0841b2bf0e6a..6be918478f85 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -35,7 +35,9 @@ struct addr_marker { }; enum address_markers_idx { - VMALLOC_START_NR = 0, + MODULES_START_NR = 0, + MODULES_END_NR, + VMALLOC_START_NR, VMALLOC_END_NR, #ifdef CONFIG_SPARSEMEM_VMEMMAP VMEMMAP_START_NR, @@ -45,12 +47,12 @@ enum address_markers_idx { FIXADDR_END_NR, PCI_START_NR, PCI_END_NR, - MODULES_START_NR, - MODULES_END_NR, KERNEL_SPACE_NR, }; static struct addr_marker address_markers[] = { + { MODULES_VADDR, "Modules start" }, + { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() Area" }, { VMALLOC_END, "vmalloc() End" }, #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -61,9 +63,7 @@ static struct addr_marker address_markers[] = { { FIXADDR_TOP, "Fixmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, - { MODULES_VADDR, "Modules start" }, - { MODULES_END, "Modules end" }, - { PAGE_OFFSET, "Kernel Mapping" }, + { PAGE_OFFSET, "Linear Mapping" }, { -1, NULL }, }; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f13078bbf66e..46ae88e0744e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -304,22 +305,26 @@ void __init mem_init(void) #ifdef CONFIG_KASAN " kasan : 0x%16lx - 0x%16lx (%6ld GB)\n" #endif + " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" + " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .data : 0x%p" " - 0x%p" " (%6ld KB)\n" #ifdef CONFIG_SPARSEMEM_VMEMMAP " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" " 0x%16lx - 0x%16lx (%6ld MB actual)\n" #endif " fixed : 0x%16lx - 0x%16lx (%6ld KB)\n" " PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n" - " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" - " memory : 0x%16lx - 0x%16lx (%6ld MB)\n" - " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .data : 0x%p" " - 0x%p" " (%6ld KB)\n", + " memory : 0x%16lx - 0x%16lx (%6ld MB)\n", #ifdef CONFIG_KASAN MLG(KASAN_SHADOW_START, KASAN_SHADOW_END), #endif + MLM(MODULES_VADDR, MODULES_END), MLG(VMALLOC_START, VMALLOC_END), + MLK_ROUNDUP(__init_begin, __init_end), + MLK_ROUNDUP(_text, _etext), + MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG((unsigned long)vmemmap, (unsigned long)vmemmap + VMEMMAP_SIZE), @@ -328,11 +333,7 @@ void __init mem_init(void) #endif MLK(FIXADDR_START, FIXADDR_TOP), MLM(PCI_IO_START, PCI_IO_END), - MLM(MODULES_VADDR, MODULES_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory), - MLK_ROUNDUP(__init_begin, __init_end), - MLK_ROUNDUP(_text, _etext), - MLK_ROUNDUP(_sdata, _edata)); + MLM(PAGE_OFFSET, (unsigned long)high_memory)); #undef MLK #undef MLM @@ -360,8 +361,8 @@ void __init mem_init(void) void free_initmem(void) { - fixup_init(); free_initmem_default(0); + 
fixup_init(); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cc569a38bc76..7f10cc91fa8a 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -17,9 +17,11 @@ #include #include +#include #include #include #include +#include #include static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); @@ -33,7 +35,7 @@ static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr, if (pmd_none(*pmd)) pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); - pte = pte_offset_kernel(pmd, addr); + pte = pte_offset_kimg(pmd, addr); do { next = addr + PAGE_SIZE; set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page), @@ -51,7 +53,7 @@ static void __init kasan_early_pmd_populate(pud_t *pud, if (pud_none(*pud)) pud_populate(&init_mm, pud, kasan_zero_pmd); - pmd = pmd_offset(pud, addr); + pmd = pmd_offset_kimg(pud, addr); do { next = pmd_addr_end(addr, end); kasan_early_pte_populate(pmd, addr, next); @@ -68,7 +70,7 @@ static void __init kasan_early_pud_populate(pgd_t *pgd, if (pgd_none(*pgd)) pgd_populate(&init_mm, pgd, kasan_zero_pud); - pud = pud_offset(pgd, addr); + pud = pud_offset_kimg(pgd, addr); do { next = pud_addr_end(addr, end); kasan_early_pmd_populate(pud, addr, next); @@ -126,9 +128,13 @@ static void __init clear_pgds(unsigned long start, void __init kasan_init(void) { + u64 kimg_shadow_start, kimg_shadow_end; struct memblock_region *reg; int i; + kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); + kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call bellow). @@ -142,8 +148,23 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE); + + /* + * vmemmap_populate() has populated the shadow region that covers the + * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round + * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent + * kasan_populate_zero_shadow() from replacing the PMD block mappings + * with PMD table mappings at the edges of the shadow region for the + * kernel image. 
+ */ + if (ARM64_SWAPPER_USES_SECTION_MAPS) + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, kasan_mem_to_shadow((void *)MODULES_VADDR)); + kasan_populate_zero_shadow((void *)kimg_shadow_end, + kasan_mem_to_shadow((void *)PAGE_OFFSET)); for_each_memblock(memory, reg) { void *start = (void *)__phys_to_virt(reg->base); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 84ef01d7aaf8..de7a8627928a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -53,6 +53,10 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); +static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; +static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; +static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -380,16 +384,15 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { - unsigned long kernel_start = __pa(_stext); - unsigned long kernel_end = __pa(_end); + unsigned long kernel_end = __pa(_etext); /* - * The kernel itself is mapped at page granularity. Map all other - * memory, making sure we don't overwrite the existing kernel mappings. + * Take care not to create a writable alias for the + * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel. */ + /* No overlap with the kernel text */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -398,8 +401,8 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel mapping. Map the portion(s) which - * don't overlap. + * This block overlaps the kernel text mapping. + * Map the portion(s) which don't overlap. */ if (start < kernel_start) __create_pgd_mapping(pgd, start, @@ -411,6 +414,16 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, early_pgtable_alloc); + + /* + * Map the linear alias of the [_stext, _etext) interval as + * read-only/non-executable. This makes the contents of the + * region accessible to subsystems such as hibernate, but + * protects it from inadvertent modification or execution. + */ + __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), + kernel_end - kernel_start, PAGE_KERNEL_RO, + early_pgtable_alloc); } static void __init map_mem(pgd_t *pgd) @@ -429,25 +442,28 @@ static void __init map_mem(pgd_t *pgd) } } -#ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { + if (!IS_ENABLED(CONFIG_DEBUG_RODATA)) + return; + create_mapping_late(__pa(_stext), (unsigned long)_stext, (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_ROX); - } -#endif void fixup_init(void) { - create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin, - (unsigned long)__init_end - (unsigned long)__init_begin, - PAGE_KERNEL); + /* + * Unmap the __init region but leave the VM area in place. This + * prevents the region from being reused for kernel modules, which + * is not supported by kallsyms. 
+ */ + unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); } static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, - pgprot_t prot) + pgprot_t prot, struct vm_struct *vma) { phys_addr_t pa_start = __pa(va_start); unsigned long size = va_end - va_start; @@ -457,6 +473,14 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, early_pgtable_alloc); + + vma->addr = va_start; + vma->phys_addr = pa_start; + vma->size = size; + vma->flags = VM_MAP; + vma->caller = __builtin_return_address(0); + + vm_area_add_early(vma); } /* @@ -464,17 +488,35 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, */ static void __init map_kernel(pgd_t *pgd) { + static struct vm_struct vmlinux_text, vmlinux_init, vmlinux_data; - map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); - map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); - map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, + &vmlinux_init); + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); - /* - * The fixmap falls in a separate pgd to the kernel, and doesn't live - * in the carveout for the swapper_pg_dir. We can simply re-use the - * existing dir for the fixmap. - */ - set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); + if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't + * live in the carveout for the swapper_pg_dir. We can simply + * re-use the existing dir for the fixmap. + */ + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), + *pgd_offset_k(FIXADDR_START)); + } else if (CONFIG_PGTABLE_LEVELS > 3) { + /* + * The fixmap shares its top level pgd entry with the kernel + * mapping. This can really only occur when we are running + * with 16k/4 levels, so we can simply reuse the pud level + * entry instead. + */ + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START), + __pud(__pa(bm_pmd) | PUD_TYPE_TABLE)); + pud_clear_fixmap(); + } else { + BUG(); + } kasan_copy_shadow(pgd); } @@ -600,14 +642,6 @@ void vmemmap_free(unsigned long start, unsigned long end) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; -#if CONFIG_PGTABLE_LEVELS > 2 -static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; -#endif -#if CONFIG_PGTABLE_LEVELS > 3 -static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; -#endif - static inline pud_t * fixmap_pud(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); @@ -639,8 +673,18 @@ void __init early_fixmap_init(void) unsigned long addr = FIXADDR_START; pgd = pgd_offset_k(addr); - pgd_populate(&init_mm, pgd, bm_pud); - pud = fixmap_pud(addr); + if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { + /* + * We only end up here if the kernel mapping and the fixmap + * share the top level pgd entry, which should only happen on + * 16k/4 levels configurations. 
+ */ + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + pud = pud_offset_kimg(pgd, addr); + } else { + pgd_populate(&init_mm, pgd, bm_pud); + pud = fixmap_pud(addr); + } pud_populate(&init_mm, pud, bm_pmd); pmd = fixmap_pmd(addr); pmd_populate_kernel(&init_mm, pmd, bm_pte); From 23b3715e4cc4262ba8fedabfe67e898549696126 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:41 +0100 Subject: [PATCH 0424/3220] UPSTREAM: arm64: defer __va translation of initrd_start and initrd_end Before deferring the assignment of memstart_addr in a subsequent patch, to the moment where all memory has been discovered and possibly clipped based on the size of the linear region and the presence of a mem= command line parameter, we need to ensure that memstart_addr is not used to perform __va translations before it is assigned. One such use is in the generic early DT discovery of the initrd location, which is recorded as a virtual address in the globals initrd_start and initrd_end. So wire up the generic support to declare the initrd addresses, and implement it without __va() translations, and perform the translation after memstart_addr has been assigned. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a89dea585371a9d5d85499db47c93f129be8e0c4) Signed-off-by: Jeff Vander Stoep Change-Id: I7d0b3dd7adcf069d4e7c1f58fd12e59c4cb62017 --- arch/arm64/include/asm/memory.h | 8 ++++++++ arch/arm64/mm/init.c | 13 +++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 4388651d1f0d..18b7e77c7495 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -121,6 +121,14 @@ #define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif +#ifdef CONFIG_BLK_DEV_INITRD +#define __early_init_dt_declare_initrd(__start, __end) \ + do { \ + initrd_start = (__start); \ + initrd_end = (__end); \ + } while (0) +#endif + #ifndef __ASSEMBLY__ extern phys_addr_t memstart_addr; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 46ae88e0744e..16e18d35eb04 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -59,8 +59,8 @@ static int __init early_initrd(char *p) if (*endp == ',') { size = memparse(endp + 1, NULL); - initrd_start = (unsigned long)__va(start); - initrd_end = (unsigned long)__va(start + size); + initrd_start = start; + initrd_end = start + size; } return 0; } @@ -170,8 +170,13 @@ void __init arm64_memblock_init(void) */ memblock_reserve(__pa(_text), _end - _text); #ifdef CONFIG_BLK_DEV_INITRD - if (initrd_start) - memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start); + if (initrd_start) { + memblock_reserve(initrd_start, initrd_end - initrd_start); + + /* the generic initrd code expects virtual addresses */ + initrd_start = __phys_to_virt(initrd_start); + initrd_end = __phys_to_virt(initrd_end); + } #endif early_init_fdt_scan_reserved_mem(); From cb4d47d9a2e863d38c1a2c81b4f2c5181cf1c8df Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:42 +0100 Subject: [PATCH 0425/3220] UPSTREAM: arm64: allow kernel Image to be loaded anywhere in physical memory This relaxes the kernel Image placement requirements, so that it may be placed at any 2 MB aligned offset in physical memory. This is accomplished by ignoring PHYS_OFFSET when installing memblocks, and accounting for the apparent virtual offset of the kernel Image. 
As a result, virtual address references below PAGE_OFFSET are correctly mapped onto physical references into the kernel Image regardless of where it sits in memory. Special care needs to be taken for dealing with memory limits passed via mem=, since the generic implementation clips memory top down, which may clip the kernel image itself if it is loaded high up in memory. To deal with this case, we simply add back the memory covering the kernel image, which may result in more memory to be retained than was passed as a mem= parameter. Since mem= should not be considered a production feature, a panic notifier handler is installed that dumps the memory limit at panic time if one was set. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a7f8de168ace487fa7b88cb154e413cf40e87fc6) Signed-off-by: Jeff Vander Stoep Change-Id: I1d28cb66b658ef89f9648918565ddc07df4660f8 --- Documentation/arm64/booting.txt | 20 +++++--- arch/arm64/include/asm/boot.h | 6 +++ arch/arm64/include/asm/kernel-pgtable.h | 12 +++++ arch/arm64/include/asm/kvm_asm.h | 17 +------ arch/arm64/include/asm/memory.h | 18 +++---- arch/arm64/kernel/head.S | 6 ++- arch/arm64/kernel/image.h | 13 +++-- arch/arm64/mm/init.c | 63 ++++++++++++++++++++++++- arch/arm64/mm/mmu.c | 3 ++ 9 files changed, 119 insertions(+), 39 deletions(-) diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt index 701d39d3171a..56d6d8b796db 100644 --- a/Documentation/arm64/booting.txt +++ b/Documentation/arm64/booting.txt @@ -109,7 +109,13 @@ Header notes: 1 - 4K 2 - 16K 3 - 64K - Bits 3-63: Reserved. + Bit 3: Kernel physical placement + 0 - 2MB aligned base should be as close as possible + to the base of DRAM, since memory below it is not + accessible via the linear mapping + 1 - 2MB aligned base may be anywhere in physical + memory + Bits 4-63: Reserved. - When image_size is zero, a bootloader should attempt to keep as much memory as possible free for use by the kernel immediately after the @@ -117,14 +123,14 @@ Header notes: depending on selected features, and is effectively unbound. The Image must be placed text_offset bytes from a 2MB aligned base -address near the start of usable system RAM and called there. Memory -below that base address is currently unusable by Linux, and therefore it -is strongly recommended that this location is the start of system RAM. -The region between the 2 MB aligned base address and the start of the -image has no special significance to the kernel, and may be used for -other purposes. +address anywhere in usable system RAM and called there. The region +between the 2 MB aligned base address and the start of the image has no +special significance to the kernel, and may be used for other purposes. At least image_size bytes from the start of the image must be free for use by the kernel. +NOTE: versions prior to v4.6 cannot make use of memory below the +physical offset of the Image so it is recommended that the Image be +placed as close as possible to the start of system RAM. 
Any memory described to the kernel (even that below the start of the image) which is not marked as reserved from the kernel (e.g., with a diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h index 81151b67b26b..ebf2481889c3 100644 --- a/arch/arm64/include/asm/boot.h +++ b/arch/arm64/include/asm/boot.h @@ -11,4 +11,10 @@ #define MIN_FDT_ALIGN 8 #define MAX_FDT_SIZE SZ_2M +/* + * arm64 requires the kernel image to be placed + * TEXT_OFFSET bytes beyond a 2 MB aligned base + */ +#define MIN_KIMG_ALIGN SZ_2M + #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index a459714ee29e..5c6375d8528b 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -79,5 +79,17 @@ #define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS) #endif +/* + * To make optimal use of block mappings when laying out the linear + * mapping, round down the base of physical memory to a size that can + * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE + * (64k granule), or a multiple that can be mapped using contiguous bits + * in the page tables: 32 * PMD_SIZE (16k granule) + */ +#ifdef CONFIG_ARM64_64K_PAGES +#define ARM64_MEMSTART_ALIGN SZ_512M +#else +#define ARM64_MEMSTART_ALIGN SZ_1G +#endif #endif /* __ASM_KERNEL_PGTABLE_H */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index e95c39543629..419bc6661b5c 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,24 +102,9 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) -#define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) +#define kvm_ksym_ref(sym) phys_to_virt((u64)&sym - kimage_voffset) #ifndef __ASSEMBLY__ -#if __GNUC__ > 4 -#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) -#else -/* - * GCC versions 4.9 and older will fold the constant below into the addend of - * the reference to 'sym' above if kvm_ksym_shift is declared static or if the - * constant is used directly. However, since we use the small code model for - * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, - * with a +/- 4 GB range, resulting in linker relocation errors if the shift - * is sufficiently large. So prevent the compiler from folding the shift into - * the addend, by making the shift a variable with external linkage. - */ -__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; -#endif - struct kvm; struct kvm_vcpu; diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 18b7e77c7495..3239e4d78e0d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -88,10 +89,10 @@ #define __virt_to_phys(x) ({ \ phys_addr_t __x = (phys_addr_t)(x); \ __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ - (__x - KIMAGE_VADDR + PHYS_OFFSET); }) + (__x - kimage_voffset); }) #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) -#define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) +#define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* * Convert a page to/from a physical address @@ -133,15 +134,16 @@ extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory.
*/ -#define PHYS_OFFSET ({ memstart_addr; }) +#define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) + +/* the offset between the kernel virtual and physical mappings */ +extern u64 kimage_voffset; /* - * The maximum physical address that the linear direct mapping - * of system RAM can cover. (PAGE_OFFSET can be interpreted as - * a 2's complement signed quantity and negated to derive the - * maximum size of the linear mapping.) + * Allow all memory at the discovery stage. We will clip it later. */ -#define MAX_MEMBLOCK_ADDR ({ memstart_addr - PAGE_OFFSET - 1; }) +#define MIN_MEMBLOCK_ADDR 0 +#define MAX_MEMBLOCK_ADDR U64_MAX /* * PFNs are used to describe any physical page; this means diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 04d38a058b19..05b98289093e 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -428,7 +428,11 @@ __mmap_switched: and x4, x4, #~(THREAD_SIZE - 1) msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer - str_l x24, memstart_addr, x6 // Save PHYS_OFFSET + + ldr x4, =KIMAGE_VADDR // Save the offset between + sub x4, x4, x24 // the kernel virtual and + str_l x4, kimage_voffset, x5 // physical mappings + mov x29, #0 #ifdef CONFIG_KASAN bl kasan_early_init diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 999633bd7294..c9c62cab25a4 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -42,15 +42,18 @@ #endif #ifdef CONFIG_CPU_BIG_ENDIAN -#define __HEAD_FLAG_BE 1 +#define __HEAD_FLAG_BE 1 #else -#define __HEAD_FLAG_BE 0 +#define __HEAD_FLAG_BE 0 #endif -#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) +#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) -#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ - (__HEAD_FLAG_PAGE_SIZE << 1)) +#define __HEAD_FLAG_PHYS_BASE 1 + +#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ + (__HEAD_FLAG_PAGE_SIZE << 1) | \ + (__HEAD_FLAG_PHYS_BASE << 3)) /* * These will output as part of the Image header, which should be little-endian diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 16e18d35eb04..e333f7f0b4f2 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -35,8 +35,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -46,7 +48,13 @@ #include "mm.h" -phys_addr_t memstart_addr __read_mostly = 0; +/* + * We need to be able to catch inadvertent references to memstart_addr + * that occur (potentially in generic code) before arm64_memblock_init() + * executes, which assigns it its actual value. So use a default value + * that cannot be mistaken for a real physical address. + */ +phys_addr_t memstart_addr __read_mostly = ~0ULL; phys_addr_t arm64_dma_phys_limit __read_mostly; #ifdef CONFIG_BLK_DEV_INITRD @@ -162,7 +170,33 @@ early_param("mem", early_mem); void __init arm64_memblock_init(void) { - memblock_enforce_memory_limit(memory_limit); + const s64 linear_region_size = -(s64)PAGE_OFFSET; + + /* + * Select a suitable value for the base of physical memory. + */ + memstart_addr = round_down(memblock_start_of_DRAM(), + ARM64_MEMSTART_ALIGN); + + /* + * Remove the memory that we will not be able to cover with the + * linear mapping. Take care not to clip the kernel which may be + * high in memory. 
+ */ + memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)), + ULLONG_MAX); + if (memblock_end_of_DRAM() > linear_region_size) + memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + + /* + * Apply the memory limit if it was set. Since the kernel may be loaded + * high up in memory, add back the kernel region that must be accessible + * via the linear mapping. + */ + if (memory_limit != (phys_addr_t)ULLONG_MAX) { + memblock_enforce_memory_limit(memory_limit); + memblock_add(__pa(_text), (u64)(_end - _text)); + } /* * Register the kernel text, kernel data, initrd, and initial @@ -388,3 +422,28 @@ static int __init keepinitrd_setup(char *__unused) __setup("keepinitrd", keepinitrd_setup); #endif + +/* + * Dump out memory limit information on panic. + */ +static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p) +{ + if (memory_limit != (phys_addr_t)ULLONG_MAX) { + pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20); + } else { + pr_emerg("Memory Limit: none\n"); + } + return 0; +} + +static struct notifier_block mem_limit_notifier = { + .notifier_call = dump_mem_limit, +}; + +static int __init register_mem_limit_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &mem_limit_notifier); + return 0; +} +__initcall(register_mem_limit_dumper); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index de7a8627928a..8577437b6c0a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -46,6 +46,9 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); +u64 kimage_voffset __read_mostly; +EXPORT_SYMBOL(kimage_voffset); + /* * Empty_zero_page is a special page that is used for zero-initialized data * and COW. From 298b730c260f6766a31ac4b3fa203d7d251a9cbd Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 19 Feb 2016 14:28:58 +0000 Subject: [PATCH 0426/3220] UPSTREAM: arm64: User die() instead of panic() in do_page_fault() The former gives better error reporting on unhandled permission faults (introduced by the UAO patches). Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 70c8abc28762d04e36c92e07eee2ce6ab41049cb) Signed-off-by: Jeff Vander Stoep Change-Id: Ia419eccf1554a32fa4131ac15b277d4d2d4eb508 --- arch/arm64/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a8eafeceb08a..44e56de23f79 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -235,10 +235,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (permission_fault(esr) && (addr < USER_DS)) { if (get_fs() == KERNEL_DS) - panic("Accessing user space memory with fs=KERNEL_DS"); + die("Accessing user space memory with fs=KERNEL_DS", regs, esr); if (!search_exception_tables(regs->pc)) - panic("Accessing user space memory outside uaccess.h routines"); + die("Accessing user space memory outside uaccess.h routines", regs, esr); } /* From 477376b452f0d5fc564dfe2d0fc0fe8156051bb1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 18:46:03 +0100 Subject: [PATCH 0427/3220] UPSTREAM: arm64: mm: only perform memstart_addr sanity check if DEBUG_VM Checking whether memstart_addr has been assigned every time it is referenced adds a branch instruction that may hurt performance if the reference in question occurs on a hot path. So only perform the check if CONFIG_DEBUG_VM=y. 
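[Editor's note: illustrative only, not part of the patch. The pattern being relaxed here — an odd sentinel in memstart_addr, checked on every PHYS_OFFSET use — can be modelled in plain C with assert(), which, like VM_BUG_ON() without CONFIG_DEBUG_VM, compiles away when NDEBUG is defined. The statement-expression syntax is the same GNU C extension the kernel macro uses.]

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* odd sentinel: no real, 2 MB aligned DRAM base can look like this */
  static uint64_t memstart_addr = ~0ULL;

  #define PHYS_OFFSET ({ assert(!(memstart_addr & 1)); memstart_addr; })

  int main(void)
  {
          memstart_addr = 0x80000000ULL;  /* hypothetical DRAM base */
          printf("PHYS_OFFSET = 0x%llx\n",
                 (unsigned long long)PHYS_OFFSET);
          return 0;
  }
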
Signed-off-by: Ard Biesheuvel [catalin.marinas@arm.com: replaced #ifdef with VM_BUG_ON] Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a92405f082d43267575444a6927085e4c8a69e4e) Signed-off-by: Jeff Vander Stoep Change-Id: Ia5f206d9a2dbbdbfc3f05fe985d4eca309f0d889 --- arch/arm64/include/asm/memory.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 3239e4d78e0d..460d09bf9442 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -132,9 +132,11 @@ #ifndef __ASSEMBLY__ +#include + extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ -#define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) +#define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; From b4fd0619fedde9b2849e844db1bde3929c336a82 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 18:46:04 +0100 Subject: [PATCH 0428/3220] UPSTREAM: arm64: mm: use bit ops rather than arithmetic in pa/va translations Since PAGE_OFFSET is chosen such that it cuts the kernel VA space right in half, and since the size of the kernel VA space itself is always a power of 2, we can treat PAGE_OFFSET as a bitmask and replace the additions/subtractions with 'or' and 'and-not' operations. For the comparison against PAGE_OFFSET, a mov/cmp/branch sequence ends up getting replaced with a single tbz instruction. For the additions and subtractions, we save a mov instruction since the mask is folded into the instruction's immediate field. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 8439e62a15614e8fcd43835d57b7245cd9870dc5) Signed-off-by: Jeff Vander Stoep Change-Id: I1ea4ef654dd7b7693f8713dab28ca0739b8a2c62 --- arch/arm64/include/asm/memory.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 460d09bf9442..eb798156cf56 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -88,10 +88,10 @@ */ #define __virt_to_phys(x) ({ \ phys_addr_t __x = (phys_addr_t)(x); \ - __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ - (__x - kimage_voffset); }) + __x & BIT(VA_BITS - 1) ? (__x & ~PAGE_OFFSET) + PHYS_OFFSET : \ + (__x - kimage_voffset); }) -#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) +#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET) #define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* @@ -132,6 +132,7 @@ #ifndef __ASSEMBLY__ +#include #include extern phys_addr_t memstart_addr; From 0983513a290d07016a2b9c097157bec9685800ce Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 23 Feb 2016 08:56:45 +0100 Subject: [PATCH 0429/3220] UPSTREAM: arm64: move brk immediate argument definitions to separate header Instead of reversing the header dependency between asm/bug.h and asm/debug-monitors.h, split off the brk instruction immediate value defines into a new header asm/brk-imm.h, and include it from both. This solves the circular dependency issue that prevents BUG() from being used in some header files, and keeps the definitions together. 
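[Editor's note: sketch, not part of the patch. The immediates collected in the new asm/brk-imm.h occupy bits [20:5] of an AArch64 BRK instruction, as the comment retained in asm/debug-monitors.h states; the base opcode 0xd4200000 used below comes from the architecture manual rather than from this patch.]

  #include <stdint.h>
  #include <stdio.h>

  #define FAULT_BRK_IMM             0x100
  #define KGDB_DYN_DBG_BRK_IMM      0x400
  #define KGDB_COMPILED_DBG_BRK_IMM 0x401
  #define BUG_BRK_IMM               0x800

  /* BRK #imm16: bits [31:21] = 0b11010100001, [20:5] = imm16, [4:0] = 0 */
  static uint32_t brk_insn(uint16_t imm16)
  {
          return 0xd4200000u | ((uint32_t)imm16 << 5);
  }

  int main(void)
  {
          printf("BRK #0x%03x -> 0x%08x\n", BUG_BRK_IMM, brk_insn(BUG_BRK_IMM));
          printf("BRK #0x%03x -> 0x%08x\n", KGDB_DYN_DBG_BRK_IMM,
                 brk_insn(KGDB_DYN_DBG_BRK_IMM));
          return 0;
  }

(BRK #0x800, the BUG() trap, encodes as 0xd4210000.)
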
Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f98deee9a9f8c47d05a0f64d86440882dca772ff) Signed-off-by: Jeff Vander Stoep Change-Id: Id4827af98ab3d413828c589bc379acecabeff108 --- arch/arm64/include/asm/brk-imm.h | 25 +++++++++++++++++++++++++ arch/arm64/include/asm/bug.h | 2 +- arch/arm64/include/asm/debug-monitors.h | 14 +------------- 3 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 arch/arm64/include/asm/brk-imm.h diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h new file mode 100644 index 000000000000..ed693c5bcec0 --- /dev/null +++ b/arch/arm64/include/asm/brk-imm.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __ASM_BRK_IMM_H +#define __ASM_BRK_IMM_H + +/* + * #imm16 values used for BRK instruction generation + * Allowed values for kgdb are 0x400 - 0x7ff + * 0x100: for triggering a fault on purpose (reserved) + * 0x400: for dynamic BRK instruction + * 0x401: for compile time BRK instruction + * 0x800: kernel-mode BUG() and WARN() traps + */ +#define FAULT_BRK_IMM 0x100 +#define KGDB_DYN_DBG_BRK_IMM 0x400 +#define KGDB_COMPILED_DBG_BRK_IMM 0x401 +#define BUG_BRK_IMM 0x800 + +#endif diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 679d49221998..561190d15881 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,7 +18,7 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#define BUG_BRK_IMM 0x800 +#include #ifdef CONFIG_GENERIC_BUG #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index e893a1fca9c2..2fcb9b7c876c 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include @@ -47,18 +47,6 @@ */ #define BREAK_INSTR_SIZE AARCH64_INSN_SIZE -/* - * #imm16 values used for BRK instruction generation - * Allowed values for kgbd are 0x400 - 0x7ff - * 0x100: for triggering a fault on purpose (reserved) - * 0x400: for dynamic BRK instruction - * 0x401: for compile time BRK instruction - * 0x800: kernel-mode BUG() and WARN() traps - */ -#define FAULT_BRK_IMM 0x100 -#define KGDB_DYN_DBG_BRK_IMM 0x400 -#define KGDB_COMPILED_DBG_BRK_IMM 0x401 - /* * BRK instruction encoding * The #imm16 value should be placed at bits[20:5] within BRK ins From 4166edc437c98ef74d6a1cf3a2b2fabeabca2b18 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 24 Nov 2015 12:37:35 +0100 Subject: [PATCH 0430/3220] UPSTREAM: arm64: add support for module PLTs This adds support for emitting PLTs at module load time for relative branches that are out of range. This is a prerequisite for KASLR, which may place the kernel and the modules anywhere in the vmalloc area, making it more likely that branch target offsets exceed the maximum range of +/- 128 MB. In this version, I removed the distinction between relocations against .init executable sections and ordinary executable sections. 
The reason is that it is hardly worth the trouble, given that .init.text usually does not contain that many far branches, and this version now only reserves PLT entry space for jump and call relocations against undefined symbols (since symbols defined in the same module can be assumed to be within +/- 128 MB)

For example, the mac80211.ko module (which is fairly sizable at ~400 KB) built with -mcmodel=large gives the following relocation counts:

                 relocs    branches   unique     !local
  .text           3925      3347       518        219
  .init.text        11         8         7          1
  .exit.text         4         4         4          1
  .text.unlikely    81        67        36         17

('unique' means branches to unique type/symbol/addend combos, of which !local is the subset referring to undefined symbols) IOW, we are only emitting a single PLT entry for the .init sections, and we are better off just adding it to the core PLT section instead. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit fd045f6cd98ec4953147b318418bd45e441e52a3) Signed-off-by: Jeff Vander Stoep Change-Id: I1b46bb817e7d16a1b9a394b100c9e5de46c0837c --- arch/arm64/Kconfig | 9 ++ arch/arm64/Makefile | 6 +- arch/arm64/include/asm/module.h | 11 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/module-plts.c | 201 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/module.c | 22 ++++ arch/arm64/kernel/module.lds | 3 + 7 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kernel/module-plts.c create mode 100644 arch/arm64/kernel/module.lds diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 14866e975277..65a8e7c9fdd1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -399,6 +399,7 @@ config ARM64_ERRATUM_843419 bool "Cortex-A53: 843419: A load or store might access an incorrect address" depends on MODULES default y + select ARM64_MODULE_CMODEL_LARGE help This option builds kernel modules using the large memory model in order to avoid the use of the ADRP instruction, which can cause @@ -762,6 +763,14 @@ config ARM64_UAO regular load/store instructions if the cpu does not implement the feature.
+config ARM64_MODULE_CMODEL_LARGE + bool + +config ARM64_MODULE_PLTS + bool + select ARM64_MODULE_CMODEL_LARGE + select HAVE_MOD_ARCH_SPECIFIC + endmenu menu "Boot options" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 693a4dc6220c..763509ff6f24 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -43,10 +43,14 @@ endif CHECKFLAGS += -D__aarch64__ -ifeq ($(CONFIG_ARM64_ERRATUM_843419), y) +ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y) KBUILD_CFLAGS_MODULE += -mcmodel=large endif +ifeq ($(CONFIG_ARM64_MODULE_PLTS),y) +KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/arm64/kernel/module.lds +endif + # Default value head-y := arch/arm64/kernel/head.o diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index e80e232b730e..8652fb613304 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -20,4 +20,15 @@ #define MODULE_ARCH_VERMAGIC "aarch64" +#ifdef CONFIG_ARM64_MODULE_PLTS +struct mod_arch_specific { + struct elf64_shdr *plt; + int plt_num_entries; + int plt_max_entries; +}; +#endif + +u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, + Elf64_Sym *sym); + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index c4e2f70c0aa0..8d971f9c6ed5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -30,6 +30,7 @@ arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ ../../arm/kernel/opcodes.o arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o +arm64-obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o arm64-obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c new file mode 100644 index 000000000000..1ce90d8450ae --- /dev/null +++ b/arch/arm64/kernel/module-plts.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2014-2016 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +struct plt_entry { + /* + * A program that conforms to the AArch64 Procedure Call Standard + * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or + * IP1 (x17) may be inserted at any branch instruction that is + * exposed to a relocation that supports long branches. Since that + * is exactly what we are dealing with here, we are free to use x16 + * as a scratch register in the PLT veneers. + */ + __le32 mov0; /* movn x16, #0x.... */ + __le32 mov1; /* movk x16, #0x...., lsl #16 */ + __le32 mov2; /* movk x16, #0x...., lsl #32 */ + __le32 br; /* br x16 */ +}; + +u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, + Elf64_Sym *sym) +{ + struct plt_entry *plt = (struct plt_entry *)mod->arch.plt->sh_addr; + int i = mod->arch.plt_num_entries; + u64 val = sym->st_value + rela->r_addend; + + /* + * We only emit PLT entries against undefined (SHN_UNDEF) symbols, + * which are listed in the ELF symtab section, but without a type + * or a size. 
+ * So, similar to how the module loader uses the Elf64_Sym::st_value + * field to store the resolved addresses of undefined symbols, let's + * borrow the Elf64_Sym::st_size field (whose value is never used by + * the module loader, even for symbols that are defined) to record + * the address of a symbol's associated PLT entry as we emit it for a + * zero addend relocation (which is the only kind we have to deal with + * in practice). This allows us to find duplicates without having to + * go through the table every time. + */ + if (rela->r_addend == 0 && sym->st_size != 0) { + BUG_ON(sym->st_size < (u64)plt || sym->st_size >= (u64)&plt[i]); + return sym->st_size; + } + + mod->arch.plt_num_entries++; + BUG_ON(mod->arch.plt_num_entries > mod->arch.plt_max_entries); + + /* + * MOVK/MOVN/MOVZ opcode: + * +--------+------------+--------+-----------+-------------+---------+ + * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] | + * +--------+------------+--------+-----------+-------------+---------+ + * + * Rd := 0x10 (x16) + * hw := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32) + * opc := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ) + * sf := 1 (64-bit variant) + */ + plt[i] = (struct plt_entry){ + cpu_to_le32(0x92800010 | (((~val ) & 0xffff)) << 5), + cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5), + cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5), + cpu_to_le32(0xd61f0200) + }; + + if (rela->r_addend == 0) + sym->st_size = (u64)&plt[i]; + + return (u64)&plt[i]; +} + +#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) +{ + const Elf64_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info)); + if (i == 0) + i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info)); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; +} + +static bool duplicate_rel(const Elf64_Rela *rela, int num) +{ + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding + * slot. + */ + return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0; +} + +static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num) +{ + unsigned int ret = 0; + Elf64_Sym *s; + int i; + + for (i = 0; i < num; i++) { + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + /* + * We only have to consider branch targets that resolve + * to undefined symbols. This is not simply a heuristic, + * it is a fundamental limitation, since the PLT itself + * is part of the module, and needs to be within 128 MB + * as well, so modules can never grow beyond that limit. + */ + s = syms + ELF64_R_SYM(rela[i].r_info); + if (s->st_shndx != SHN_UNDEF) + break; + + /* + * Jump relocations with non-zero addends against + * undefined symbols are supported by the ELF spec, but + * do not occur in practice (e.g., 'jump n bytes past + * the entry point of undefined function symbol f'). + * So we need to support them, but there is no need to + * take them into consideration when trying to optimize + * this code. So let's only check for duplicates when + * the addend is zero: this allows us to record the PLT + * entry address in the symbol table itself, rather than + * having to search the list for duplicates each time we + * emit one. 
+ */ + if (rela[i].r_addend != 0 || !duplicate_rel(rela, i)) + ret++; + break; + } + } + return ret; +} + +int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + unsigned long plt_max_entries = 0; + Elf64_Sym *syms = NULL; + int i; + + /* + * Find the empty .plt section so we can expand it to store the PLT + * entries. Record the symtab address as well. + */ + for (i = 0; i < ehdr->e_shnum; i++) { + if (strcmp(".plt", secstrings + sechdrs[i].sh_name) == 0) + mod->arch.plt = sechdrs + i; + else if (sechdrs[i].sh_type == SHT_SYMTAB) + syms = (Elf64_Sym *)sechdrs[i].sh_addr; + } + + if (!mod->arch.plt) { + pr_err("%s: module PLT section missing\n", mod->name); + return -ENOEXEC; + } + if (!syms) { + pr_err("%s: module symtab section missing\n", mod->name); + return -ENOEXEC; + } + + for (i = 0; i < ehdr->e_shnum; i++) { + Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; + int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; + + if (sechdrs[i].sh_type != SHT_RELA) + continue; + + /* ignore relocations that operate on non-exec sections */ + if (!(dstsec->sh_flags & SHF_EXECINSTR)) + continue; + + /* sort by type, symbol index and addend */ + sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + + plt_max_entries += count_plts(syms, rels, numrels); + } + + mod->arch.plt->sh_type = SHT_NOBITS; + mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.plt->sh_size = plt_max_entries * sizeof(struct plt_entry); + mod->arch.plt_num_entries = 0; + mod->arch.plt_max_entries = plt_max_entries; + return 0; +} diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 93e970231ca9..a9dde97f5ca5 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -38,6 +38,21 @@ void *module_alloc(unsigned long size) GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + !IS_ENABLED(CONFIG_KASAN)) + /* + * KASAN can only deal with module allocations being served + * from the reserved module region, since the remainder of + * the vmalloc region is already backed by zero shadow pages, + * and punching holes into it is non-trivial. Since the module + * region is not randomized when KASAN is enabled, it is even + * less likely that the module region gets exhausted, so we + * can simply omit this fallback in that case. 
+ */ + p = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, + VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { vfree(p); return NULL; @@ -361,6 +376,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, AARCH64_INSN_IMM_26); + + if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + ovf == -ERANGE) { + val = module_emit_plt_entry(me, &rel[i], sym); + ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, + 26, AARCH64_INSN_IMM_26); + } break; default: diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds new file mode 100644 index 000000000000..8949f6c6f729 --- /dev/null +++ b/arch/arm64/kernel/module.lds @@ -0,0 +1,3 @@ +SECTIONS { + .plt (NOLOAD) : { BYTE(0) } +} From ee1ef5eefa116dd23b1311c019e6d4d115bb1f4d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 26 Dec 2015 13:48:02 +0100 Subject: [PATCH 0431/3220] UPSTREAM: arm64: avoid R_AARCH64_ABS64 relocations for Image header fields Unfortunately, the current way of using the linker to emit build time constants into the Image header will no longer work once we switch to the use of PIE executables. The reason is that such constants are emitted into the binary using R_AARCH64_ABS64 relocations, which are resolved at runtime, not at build time, and the places targeted by those relocations will contain zeroes before that. So refactor the endian swapping linker script constant generation code so that it emits the upper and lower 32-bit words separately. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6ad1fe5d9077a1ab40bf74b61994d2e770b00b14) Signed-off-by: Jeff Vander Stoep Change-Id: Iaa809a0b5fcf628e1e49cd6aaa0f31f31ce95c23 --- arch/arm64/include/asm/assembler.h | 11 ++++++++++ arch/arm64/kernel/head.S | 6 +++--- arch/arm64/kernel/image.h | 32 ++++++++++++++++++------------ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bb7b72734c24..ba5aff6c830e 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -215,4 +215,15 @@ lr .req x30 // link register .size __pi_##x, . - x; \ ENDPROC(x) + /* + * Emit a 64-bit absolute little endian symbol reference in a way that + * ensures that it will be resolved at build time, even when building a + * PIE binary. This requires cooperation from the linker script, which + * must emit the lo32/hi32 halves individually. 
+ */ + .macro le64sym, sym + .long \sym\()_lo32 + .long \sym\()_hi32 + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 05b98289093e..f076debf392d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -83,9 +83,9 @@ efi_head: b stext // branch to kernel start, magic .long 0 // reserved #endif - .quad _kernel_offset_le // Image load offset from start of RAM, little-endian - .quad _kernel_size_le // Effective size of kernel image, little-endian - .quad _kernel_flags_le // Informative flags, little-endian + le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian + le64sym _kernel_size_le // Effective size of kernel image, little-endian + le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved .quad 0 // reserved .quad 0 // reserved diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index c9c62cab25a4..db1bf57948f1 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -26,21 +26,27 @@ * There aren't any ELF relocations we can use to endian-swap values known only * at link time (e.g. the subtraction of two symbol addresses), so we must get * the linker to endian-swap certain values before emitting them. + * + * Note that, in order for this to work when building the ELF64 PIE executable + * (for KASLR), these values should not be referenced via R_AARCH64_ABS64 + * relocations, since these are fixed up at runtime rather than at build time + * when PIE is in effect. So we need to split them up in 32-bit high and low + * words. */ #ifdef CONFIG_CPU_BIG_ENDIAN -#define DATA_LE64(data) \ - ((((data) & 0x00000000000000ff) << 56) | \ - (((data) & 0x000000000000ff00) << 40) | \ - (((data) & 0x0000000000ff0000) << 24) | \ - (((data) & 0x00000000ff000000) << 8) | \ - (((data) & 0x000000ff00000000) >> 8) | \ - (((data) & 0x0000ff0000000000) >> 24) | \ - (((data) & 0x00ff000000000000) >> 40) | \ - (((data) & 0xff00000000000000) >> 56)) +#define DATA_LE32(data) \ + ((((data) & 0x000000ff) << 24) | \ + (((data) & 0x0000ff00) << 8) | \ + (((data) & 0x00ff0000) >> 8) | \ + (((data) & 0xff000000) >> 24)) #else -#define DATA_LE64(data) ((data) & 0xffffffffffffffff) +#define DATA_LE32(data) ((data) & 0xffffffff) #endif +#define DEFINE_IMAGE_LE64(sym, data) \ + sym##_lo32 = DATA_LE32((data) & 0xffffffff); \ + sym##_hi32 = DATA_LE32((data) >> 32) + #ifdef CONFIG_CPU_BIG_ENDIAN #define __HEAD_FLAG_BE 1 #else @@ -61,9 +67,9 @@ * endian swapped in head.S, all are done here for consistency. */ #define HEAD_SYMBOLS \ - _kernel_size_le = DATA_LE64(_end - _text); \ - _kernel_offset_le = DATA_LE64(TEXT_OFFSET); \ - _kernel_flags_le = DATA_LE64(__HEAD_FLAGS); + DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \ + DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ + DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); #ifdef CONFIG_EFI From 68c7746e3ca999b2d3c75af8ac714ad28c1a4288 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 26 Dec 2015 12:46:40 +0100 Subject: [PATCH 0432/3220] UPSTREAM: arm64: avoid dynamic relocations in early boot code Before implementing KASLR for arm64 by building a self-relocating PIE executable, we have to ensure that values we use before the relocation routine is executed are not subject to dynamic relocation themselves. This applies not only to virtual addresses, but also to values that are supplied by the linker at build time and relocated using R_AARCH64_ABS64 relocations. 
So instead, use assemble time constants, or force the use of static relocations by folding the constants into the instructions. Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2bf31a4a05f5b00f37d65ba029d36a0230286cb7) Signed-off-by: Jeff Vander Stoep Change-Id: Icce0176591e3c0ae444e1ea54258efe677933c5b --- arch/arm64/kernel/efi-entry.S | 2 +- arch/arm64/kernel/head.S | 39 +++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index a773db92908b..f82036e02485 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -61,7 +61,7 @@ ENTRY(entry) */ mov x20, x0 // DTB address ldr x0, [sp, #16] // relocated _text address - ldr x21, =stext_offset + movz x21, #:abs_g0:stext_offset add x21, x0, x21 /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index f076debf392d..4cad8f9f2268 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -67,12 +67,11 @@ * in the entry routines. */ __HEAD - +_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ #ifdef CONFIG_EFI -efi_head: /* * This add instruction has no meaningful effect except that * its opcode forms the magic "MZ" signature required by UEFI. @@ -94,14 +93,14 @@ efi_head: .byte 0x4d .byte 0x64 #ifdef CONFIG_EFI - .long pe_header - efi_head // Offset to the PE header. + .long pe_header - _head // Offset to the PE header. #else .word 0 // reserved #endif #ifdef CONFIG_EFI .globl __efistub_stext_offset - .set __efistub_stext_offset, stext - efi_head + .set __efistub_stext_offset, stext - _head .align 3 pe_header: .ascii "PE" @@ -124,7 +123,7 @@ optional_header: .long _end - stext // SizeOfCode .long 0 // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_entry - efi_head // AddressOfEntryPoint + .long __efistub_entry - _head // AddressOfEntryPoint .long __efistub_stext_offset // BaseOfCode extra_header_fields: @@ -139,7 +138,7 @@ extra_header_fields: .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - efi_head // SizeOfImage + .long _end - _head // SizeOfImage // Everything before the kernel image is considered part of the header .long __efistub_stext_offset // SizeOfHeaders @@ -219,11 +218,13 @@ ENTRY(stext) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ - ldr x27, =__mmap_switched // address to jump to after + ldr x27, 0f // address to jump to after // MMU has been enabled adr_l lr, __enable_mmu // return (PIC) address b __cpu_setup // initialise processor ENDPROC(stext) + .align 3 +0: .quad __mmap_switched - (_head - TEXT_OFFSET) + KIMAGE_VADDR /* * Preserve the arguments passed by the bootloader in x0 .. 
x3 @@ -391,7 +392,8 @@ __create_page_tables: mov x0, x26 // swapper_pg_dir ldr x5, =KIMAGE_VADDR create_pgd_entry x0, x5, x3, x6 - ldr x6, =KERNEL_END // __va(KERNEL_END) + ldr w6, kernel_img_size + add x6, x6, x5 mov x3, x24 // phys offset create_block_map x0, x7, x3, x5, x6 @@ -408,6 +410,9 @@ __create_page_tables: mov lr, x27 ret ENDPROC(__create_page_tables) + +kernel_img_size: + .long _end - (_head - TEXT_OFFSET) .ltorg /* @@ -415,6 +420,10 @@ ENDPROC(__create_page_tables) */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: + adr_l x8, vectors // load VBAR_EL1 with virtual + msr vbar_el1, x8 // vector table address + isb + // Clear BSS adr_l x0, __bss_start mov x1, xzr @@ -610,13 +619,19 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x21, =secondary_data - ldr x27, =__secondary_switched // address to jump to after enabling the MMU + ldr x8, =KIMAGE_VADDR + ldr w9, 0f + sub x27, x8, w9, sxtw // address to jump to after enabling the MMU b __enable_mmu ENDPROC(secondary_startup) +0: .long (_text - TEXT_OFFSET) - __secondary_switched ENTRY(__secondary_switched) - ldr x0, [x21] // get secondary_data.stack + adr_l x5, vectors + msr vbar_el1, x5 + isb + + ldr_l x0, secondary_data // get secondary_data.stack mov sp, x0 and x0, x0, #~(THREAD_SIZE - 1) msr sp_el0, x0 // save thread_info @@ -641,8 +656,6 @@ __enable_mmu: ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED b.ne __no_granule_support - ldr x5, =vectors - msr vbar_el1, x5 msr ttbr0_el1, x25 // load TTBR0 msr ttbr1_el1, x26 // load TTBR1 isb From d78098f62acbb27b7b222b8e3a517581d0842a9f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 17:08:26 +0100 Subject: [PATCH 0433/3220] UPSTREAM: arm64: make asm/elf.h available to asm files This reshuffles some code in asm/elf.h and puts a #ifndef __ASSEMBLY__ around its C definitions so that the CPP defines can be used in asm source files as well. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 4a2e034e5cdadde4c712f79bdd57d1455c76a3db) Signed-off-by: Jeff Vander Stoep Change-Id: Ic499e950d2ef297d10848862a6dfa07b90887f4c --- arch/arm64/include/asm/elf.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index faad6df49e5b..435f55952e1f 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -24,15 +24,6 @@ #include #include -typedef unsigned long elf_greg_t; - -#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t)) -#define ELF_CORE_COPY_REGS(dest, regs) \ - *(struct user_pt_regs *)&(dest) = (regs)->user_regs; - -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -typedef struct user_fpsimd_state elf_fpregset_t; - /* * AArch64 static relocation types. */ @@ -127,6 +118,17 @@ typedef struct user_fpsimd_state elf_fpregset_t; */ #define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) +#ifndef __ASSEMBLY__ + +typedef unsigned long elf_greg_t; + +#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t)) +#define ELF_CORE_COPY_REGS(dest, regs) \ + *(struct user_pt_regs *)&(dest) = (regs)->user_regs; + +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; +typedef struct user_fpsimd_state elf_fpregset_t; + /* * When the program starts, a1 contains a pointer to a function to be * registered with atexit, as per the SVR4 ABI. 
A value of 0 means we have no @@ -186,4 +188,6 @@ extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, #endif /* CONFIG_COMPAT */ +#endif /* !__ASSEMBLY__ */ + #endif From e0357208fbf9e2c9c33af1900c52c2cbaefbe910 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 2 Feb 2016 15:53:59 +0000 Subject: [PATCH 0434/3220] UPSTREAM: arm64: futex.h: Add missing PAN toggling futex.h's futex_atomic_cmpxchg_inatomic() does not use the __futex_atomic_op() macro and needs its own PAN toggling. This was missed when the feature was implemented. Fixes: 338d4f49d6f ("arm64: kernel: Add support for Privileged Access Never") Signed-off-by: James Morse Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 811d61e384e24759372bb3f01772f3744b0a8327) Signed-off-by: Jeff Vander Stoep Change-Id: I6e7b338a1af17b784d4196101422c3acee3b88ed --- arch/arm64/include/asm/futex.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 007a69fc4f40..5f3ab8c1db55 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -121,6 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return -EFAULT; asm volatile("// futex_atomic_cmpxchg_inatomic\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -137,6 +138,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, " .align 3\n" " .quad 1b, 4b, 2b, 4b\n" " .popsection\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); From eebed210c5e62cfe0f9f8de4173099f2d32faae9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 10 Jan 2016 11:42:28 +0100 Subject: [PATCH 0435/3220] UPSTREAM: scripts/sortextable: add support for ET_DYN binaries Add support to scripts/sortextable for handling relocatable (PIE) executables, whose ELF type is ET_DYN, not ET_EXEC. Other than adding support for the new type, no changes are needed. 
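For illustration, the acceptance test described above reduces to inspecting e_type in the ELF header. A minimal user-space sketch follows, assuming a little-endian host (the real tool reads the field through its endian-aware r2() accessor); the helper name is hypothetical:

#include <elf.h>
#include <stdbool.h>
#include <string.h>

/* Accept fixed-address (ET_EXEC) and relocatable PIE (ET_DYN) images alike. */
static bool is_sortable_elf(const unsigned char *image)
{
        const Elf64_Ehdr *ehdr = (const Elf64_Ehdr *)image;

        if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
                return false;
        if (ehdr->e_ident[EI_VERSION] != EV_CURRENT)
                return false;
        return ehdr->e_type == ET_EXEC || ehdr->e_type == ET_DYN;
}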
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 7b957b6e603623ef8b2e8222fa94b976df613fa2) Signed-off-by: Jeff Vander Stoep Change-Id: If55296ef4934b99c38ceb5acbd7c4a7fb23f24c1 --- scripts/sortextable.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/sortextable.c b/scripts/sortextable.c index c2423d913b46..ecefa0a634f8 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -266,9 +266,9 @@ do_file(char const *const fname) break; } /* end switch */ if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 - || r2(&ehdr->e_type) != ET_EXEC + || (r2(&ehdr->e_type) != ET_EXEC && r2(&ehdr->e_type) != ET_DYN) || ehdr->e_ident[EI_VERSION] != EV_CURRENT) { - fprintf(stderr, "unrecognized ET_EXEC file %s\n", fname); + fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file %s\n", fname); fail_file(); } @@ -304,7 +304,7 @@ do_file(char const *const fname) if (r2(&ehdr->e_ehsize) != sizeof(Elf32_Ehdr) || r2(&ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, - "unrecognized ET_EXEC file: %s\n", fname); + "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); fail_file(); } do32(ehdr, fname, custom_sort); @@ -314,7 +314,7 @@ do_file(char const *const fname) if (r2(&ghdr->e_ehsize) != sizeof(Elf64_Ehdr) || r2(&ghdr->e_shentsize) != sizeof(Elf64_Shdr)) { fprintf(stderr, - "unrecognized ET_EXEC file: %s\n", fname); + "unrecognized ET_EXEC/ET_DYN file: %s\n", fname); fail_file(); } do64(ghdr, fname, custom_sort); From 27375cf76085992a805dc5ad1db397e7227717c9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jan 2016 12:39:09 +0100 Subject: [PATCH 0436/3220] UPSTREAM: extable: add support for relative extables to search and sort routines This adds support to the generic search_extable() and sort_extable() implementations for dealing with exception table entries whose fields contain relative offsets rather than absolute addresses. Acked-by: Helge Deller Acked-by: Heiko Carstens Acked-by: H. Peter Anvin Acked-by: Tony Luck Acked-by: Will Deacon Acked-by: Richard Henderson Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a272858a3c1ecd4a935ba23c66668f81214bd110) Signed-off-by: Jeff Vander Stoep Change-Id: I9d144d351d547c49bf3203a69dfff3cb71a51177 --- lib/extable.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/lib/extable.c b/lib/extable.c index 4cac81ec225e..0be02ad561e9 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -14,7 +14,37 @@ #include #include +#ifndef ARCH_HAS_RELATIVE_EXTABLE +#define ex_to_insn(x) ((x)->insn) +#else +static inline unsigned long ex_to_insn(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} +#endif + #ifndef ARCH_HAS_SORT_EXTABLE +#ifndef ARCH_HAS_RELATIVE_EXTABLE +#define swap_ex NULL +#else +static void swap_ex(void *a, void *b, int size) +{ + struct exception_table_entry *x = a, *y = b, tmp; + int delta = b - a; + + tmp = *x; + x->insn = y->insn + delta; + y->insn = tmp.insn - delta; + +#ifdef swap_ex_entry_fixup + swap_ex_entry_fixup(x, y, tmp, delta); +#else + x->fixup = y->fixup + delta; + y->fixup = tmp.fixup - delta; +#endif +} +#endif /* ARCH_HAS_RELATIVE_EXTABLE */ + /* * The exception table needs to be sorted so that the binary * search that we use to find entries in it works properly. 
@@ -26,9 +56,9 @@ static int cmp_ex(const void *a, const void *b) const struct exception_table_entry *x = a, *y = b; /* avoid overflow */ - if (x->insn > y->insn) + if (ex_to_insn(x) > ex_to_insn(y)) return 1; - if (x->insn < y->insn) + if (ex_to_insn(x) < ex_to_insn(y)) return -1; return 0; } @@ -37,7 +67,7 @@ void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, NULL); + cmp_ex, swap_ex); } #ifdef CONFIG_MODULES @@ -48,13 +78,15 @@ void sort_extable(struct exception_table_entry *start, void trim_init_extable(struct module *m) { /*trim the beginning*/ - while (m->num_exentries && within_module_init(m->extable[0].insn, m)) { + while (m->num_exentries && + within_module_init(ex_to_insn(&m->extable[0]), m)) { m->extable++; m->num_exentries--; } /*trim the end*/ while (m->num_exentries && - within_module_init(m->extable[m->num_exentries-1].insn, m)) + within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]), + m)) m->num_exentries--; } #endif /* CONFIG_MODULES */ @@ -81,13 +113,13 @@ search_extable(const struct exception_table_entry *first, * careful, the distance between value and insn * can be larger than MAX_LONG: */ - if (mid->insn < value) + if (ex_to_insn(mid) < value) first = mid + 1; - else if (mid->insn > value) + else if (ex_to_insn(mid) > value) last = mid - 1; else return mid; - } - return NULL; + } + return NULL; } #endif From 161994d19e72907ec95e545d1129675c9c966f1a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jan 2016 15:02:12 +0100 Subject: [PATCH 0437/3220] UPSTREAM: arm64: switch to relative exception tables Instead of using absolute addresses for both the exception location and the fixup, use offsets relative to the exception table entry values. Not only does this cut the size of the exception table in half, it is also a prerequisite for KASLR, since absolute exception table entries are subject to dynamic relocation, which is incompatible with the sorting of the exception table that occurs at build time. This patch also introduces the _ASM_EXTABLE preprocessor macro (which exists on x86 as well) and its _asm_extable assembly counterpart, as shorthands to emit exception table entries. 
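The relative scheme is compact enough to state in a few lines of C. This is an illustrative sketch only: the struct layout mirrors the patch, but the two helpers are named for exposition rather than taken from the kernel:

struct exception_table_entry {
        int insn, fixup;        /* self-relative offsets, not absolute addresses */
};

/* Absolute address of the instruction that is allowed to fault. */
static inline unsigned long ex_insn_addr(const struct exception_table_entry *e)
{
        return (unsigned long)&e->insn + e->insn;
}

/* Absolute address at which execution resumes after the fault. */
static inline unsigned long ex_fixup_addr(const struct exception_table_entry *e)
{
        return (unsigned long)&e->fixup + e->fixup;
}

Each 32-bit field only needs to span the distance between the entry and its target, which is why the table shrinks to half its former size.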
Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6c94f27ac847ff8ef15b3da5b200574923bd6287) Signed-off-by: Jeff Vander Stoep Change-Id: Icedda8ee8c32843c439765783816d7d71ca0073a --- arch/arm64/include/asm/alternative.h | 19 +++++----------- arch/arm64/include/asm/assembler.h | 15 +++++++++---- arch/arm64/include/asm/futex.h | 12 ++++------ arch/arm64/include/asm/uaccess.h | 30 +++++++++++++------------ arch/arm64/include/asm/word-at-a-time.h | 7 +++--- arch/arm64/kernel/armv8_deprecated.c | 7 ++---- arch/arm64/mm/extable.c | 2 +- scripts/sortextable.c | 2 +- 8 files changed, 43 insertions(+), 51 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index a9fc24ec1aa9..beccbdefa106 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -157,11 +157,8 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .quad 8889b,\l; - .previous; + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; .endm .macro uao_stp l, reg1, reg2, addr, post_inc @@ -175,11 +172,8 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .quad 8889b,\l; - .previous + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; .endm .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc @@ -191,10 +185,7 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .previous + _asm_extable 8888b,\l; .endm #else .macro uao_ldp l, reg1, reg2, addr, post_inc diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ba5aff6c830e..70f7b9e04598 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -94,12 +94,19 @@ dmb \opt .endm +/* + * Emit an entry into the exception table + */ + .macro _asm_extable, from, to + .pushsection __ex_table, "a" + .align 3 + .long (\from - .), (\to - .) + .popsection + .endm + #define USER(l, x...) \ 9999: x; \ - .section __ex_table,"a"; \ - .align 3; \ - .quad 9999b,l; \ - .previous + _asm_extable 9999b, l /* * Register aliases. 
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 5f3ab8c1db55..f2585cdd32c2 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -42,10 +42,8 @@ "4: mov %w0, %w5\n" \ " b 3b\n" \ " .popsection\n" \ -" .pushsection __ex_table,\"a\"\n" \ -" .align 3\n" \ -" .quad 1b, 4b, 2b, 4b\n" \ -" .popsection\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ @@ -134,10 +132,8 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) "4: mov %w0, %w6\n" " b 3b\n" " .popsection\n" -" .pushsection __ex_table,\"a\"\n" -" .align 3\n" -" .quad 1b, 4b, 2b, 4b\n" -" .popsection\n" + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 4b) ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 7b07e73b6316..c3d445b42351 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -36,11 +36,11 @@ #define VERIFY_WRITE 1 /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of pairs of relative offsets: the first + * is the relative offset to an instruction that is allowed to fault, + * and the second is the relative offset at which the program should + * continue. No registers are modified, so it is entirely up to the + * continuation code to figure out what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. 
This means when everything is well, @@ -50,9 +50,11 @@ struct exception_table_entry { - unsigned long insn, fixup; + int insn, fixup; }; +#define ARCH_HAS_RELATIVE_EXTABLE + extern int fixup_exception(struct pt_regs *regs); #define KERNEL_DS (-1UL) @@ -115,6 +117,12 @@ static inline void set_fs(mm_segment_t fs) #define access_ok(type, addr, size) __range_ok(addr, size) #define user_addr_max get_fs +#define _ASM_EXTABLE(from, to) \ + " .pushsection __ex_table, \"a\"\n" \ + " .align 3\n" \ + " .long (" #from " - .), (" #to " - .)\n" \ + " .popsection\n" + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -134,10 +142,7 @@ static inline void set_fs(mm_segment_t fs) " mov %1, #0\n" \ " b 2b\n" \ " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 1b, 3b\n" \ - " .previous" \ + _ASM_EXTABLE(1b, 3b) \ : "+r" (err), "=&r" (x) \ : "r" (addr), "i" (-EFAULT)) @@ -206,10 +211,7 @@ do { \ "3: mov %w0, %3\n" \ " b 2b\n" \ " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 1b, 3b\n" \ - " .previous" \ + _ASM_EXTABLE(1b, 3b) \ : "+r" (err) \ : "r" (x), "r" (addr), "i" (-EFAULT)) diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index aab5bf09e9d9..2b79b8a89457 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ b/arch/arm64/include/asm/word-at-a-time.h @@ -16,6 +16,8 @@ #ifndef __ASM_WORD_AT_A_TIME_H #define __ASM_WORD_AT_A_TIME_H +#include + #ifndef __AARCH64EB__ #include @@ -81,10 +83,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) #endif " b 2b\n" " .popsection\n" - " .pushsection __ex_table,\"a\"\n" - " .align 3\n" - " .quad 1b, 3b\n" - " .popsection" + _ASM_EXTABLE(1b, 3b) : "=&r" (ret), "=&r" (offset) : "r" (addr), "Q" (*(unsigned long *)addr)); diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 3e01207917b1..c37202c0c838 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -297,11 +297,8 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) "4: mov %w0, %w5\n" \ " b 3b\n" \ " .popsection" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 0b, 4b\n" \ - " .quad 1b, 4b\n" \ - " .popsection\n" \ + _ASM_EXTABLE(0b, 4b) \ + _ASM_EXTABLE(1b, 4b) \ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index 79444279ba8c..81acd4706878 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(instruction_pointer(regs)); if (fixup) - regs->pc = fixup->fixup; + regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; return fixup != NULL; } diff --git a/scripts/sortextable.c b/scripts/sortextable.c index ecefa0a634f8..19d83647846c 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -282,12 +282,12 @@ do_file(char const *const fname) case EM_386: case EM_X86_64: case EM_S390: + case EM_AARCH64: custom_sort = sort_relative_table; break; case EM_ARCOMPACT: case EM_ARCV2: case EM_ARM: - case EM_AARCH64: case EM_MICROBLAZE: case EM_MIPS: case EM_XTENSA: From 01684359230797235a0bdbd9768241c41f990105 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 09:13:44 +0100 Subject: [PATCH 0438/3220] 
UPSTREAM: arm64: add support for building vmlinux as a relocatable PIE binary This implements CONFIG_RELOCATABLE, which links the final vmlinux image with a dynamic relocation section, allowing the early boot code to perform a relocation to a different virtual address at runtime. This is a prerequisite for KASLR (CONFIG_RANDOMIZE_BASE). Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 1e48ef7fcc374051730381a2a05da77eb4eafdb0) Signed-off-by: Jeff Vander Stoep Change-Id: If02e065722d438f85feb62240fc230e16f58e912 --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/Makefile | 4 ++++ arch/arm64/include/asm/elf.h | 2 ++ arch/arm64/kernel/head.S | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 16 ++++++++++++++++ 5 files changed, 65 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 65a8e7c9fdd1..1faf7905f0fa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -771,6 +771,17 @@ config ARM64_MODULE_PLTS select ARM64_MODULE_CMODEL_LARGE select HAVE_MOD_ARCH_SPECIFIC +config RELOCATABLE + bool + help + This builds the kernel as a Position Independent Executable (PIE), + which retains all relocation metadata required to relocate the + kernel binary at runtime to a different virtual address than the + address it was linked at. + Since AArch64 uses the RELA relocation format, this requires a + relocation pass at runtime even if the kernel is loaded at the + same address it was linked at. + endmenu menu "Boot options" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 763509ff6f24..47f61e9b8fec 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -15,6 +15,10 @@ CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S GZFLAGS :=-9 +ifneq ($(CONFIG_RELOCATABLE),) +LDFLAGS_vmlinux += -pie +endif + KBUILD_DEFCONFIG := defconfig # Check for binutils support for specific extensions diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 435f55952e1f..24ed037f09fd 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -77,6 +77,8 @@ #define R_AARCH64_MOVW_PREL_G2_NC 292 #define R_AARCH64_MOVW_PREL_G3 293 +#define R_AARCH64_RELATIVE 1027 + /* * These are used to set parameters in the core dumps. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4cad8f9f2268..4e69412a7323 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -432,6 +433,37 @@ __mmap_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW +#ifdef CONFIG_RELOCATABLE + + /* + * Iterate over each entry in the relocation table, and apply the + * relocations in place. + */ + adr_l x8, __dynsym_start // start of symbol table + adr_l x9, __reloc_start // start of reloc table + adr_l x10, __reloc_end // end of reloc table + +0: cmp x9, x10 + b.hs 2f + ldp x11, x12, [x9], #24 + ldr x13, [x9, #-8] + cmp w12, #R_AARCH64_RELATIVE + b.ne 1f + str x13, [x11] + b 0b + +1: cmp w12, #R_AARCH64_ABS64 + b.ne 0b + add x12, x12, x12, lsl #1 // symtab offset: 24x top word + add x12, x8, x12, lsr #(32 - 3) // ... 
shifted into bottom word + ldr x15, [x12, #8] // Elf64_Sym::st_value + add x15, x13, x15 + str x15, [x11] + b 0b + +2: +#endif + adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7703feba23d6..14813a6ca13b 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -87,6 +87,7 @@ SECTIONS EXIT_CALL *(.discard) *(.discard.*) + *(.interp .dynamic) } . = KIMAGE_VADDR + TEXT_OFFSET; @@ -151,6 +152,21 @@ SECTIONS .altinstr_replacement : { *(.altinstr_replacement) } + .rela : ALIGN(8) { + __reloc_start = .; + *(.rela .rela*) + __reloc_end = .; + } + .dynsym : ALIGN(8) { + __dynsym_start = .; + *(.dynsym) + } + .dynstr : { + *(.dynstr) + } + .hash : { + *(.hash) + } . = ALIGN(PAGE_SIZE); __init_end = .; From a91a00b66c794c81d7afd04f9d2a5385952a24e1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 14:12:01 +0100 Subject: [PATCH 0439/3220] BACKPORT: arm64: add support for kernel ASLR This adds support for KASLR, based on entropy provided by the bootloader in the /chosen/kaslr-seed DT property. Depending on the size of the address space (VA_BITS) and the page size, the entropy in the virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all 4 levels), with the sidenote that displacements that result in the kernel image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB granule kernels, respectively) are not allowed, and will be rounded up to an acceptable value. If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is randomized independently from the core kernel. This makes it less likely that the location of core kernel data structures can be determined by an adversary, but causes all function calls from modules into the core kernel to be resolved via entries in the module PLTs. If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is randomized by choosing a page aligned 128 MB region inside the interval [_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of entropy (depending on page size), independently of the kernel randomization, but still guarantees that modules are within the range of relative branch and jump instructions (with the caveat that, since the module region is shared with other uses of the vmalloc area, modules may need to be loaded further away if the module region is exhausted). Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit f80fb3a3d50843a401dac4b566b3b131da8077a2) Signed-off-by: Jeff Vander Stoep Change-Id: I3f5fafa4e92e5ff39259d57065541366237eb021 --- arch/arm64/Kconfig | 29 ++++++ arch/arm64/include/asm/memory.h | 5 +- arch/arm64/include/asm/module.h | 6 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/head.S | 59 +++++++++-- arch/arm64/kernel/kaslr.c | 173 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/module.c | 3 +- arch/arm64/kernel/setup.c | 29 ++++++ arch/arm64/mm/kasan_init.c | 17 +++- arch/arm64/mm/mmu.c | 29 ++++-- 10 files changed, 329 insertions(+), 22 deletions(-) create mode 100644 arch/arm64/kernel/kaslr.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1faf7905f0fa..a42e3235484b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -782,6 +782,35 @@ config RELOCATABLE relocation pass at runtime even if the kernel is loaded at the same address it was linked at. 
+config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS + select RELOCATABLE + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts + relying on knowledge of the location of kernel internals. + + It is the bootloader's job to provide entropy, by passing a + random u64 value in /chosen/kaslr-seed at kernel entry. + + If unsure, say N. + +config RANDOMIZE_MODULE_REGION_FULL + bool "Randomize the module region independently from the core kernel" + depends on RANDOMIZE_BASE + default y + help + Randomizes the location of the module region without considering the + location of the core kernel. This way, it is impossible for modules + to leak information about the location of core kernel data structures + but it does imply that function calls between modules and the core + kernel will need to be resolved via veneers in the module PLT. + + When this option is not set, the module region will be randomized over + a limited range that contains the [_stext, _etext] interval of the + core kernel, so branch relocations are always in range. + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index eb798156cf56..5f8667a99e41 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -53,7 +53,7 @@ #define KIMAGE_VADDR (MODULES_END) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) -#define MODULES_VSIZE (SZ_64M) +#define MODULES_VSIZE (SZ_128M) #define PCI_IO_END (PAGE_OFFSET - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) @@ -139,6 +139,9 @@ extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. 
*/ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) +/* the virtual base of the kernel image (minus TEXT_OFFSET) */ +extern u64 kimage_vaddr; + /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 8652fb613304..e12af6754634 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -31,4 +31,10 @@ struct mod_arch_specific { u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, Elf64_Sym *sym); +#ifdef CONFIG_RANDOMIZE_BASE +extern u64 module_alloc_base; +#else +#define module_alloc_base ((u64)_etext - MODULES_VSIZE) +#endif + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 8d971f9c6ed5..49a2430b0786 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -43,6 +43,7 @@ arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o +arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4e69412a7323..319f896c6e74 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -210,6 +210,7 @@ section_table: ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode + mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 @@ -313,7 +314,7 @@ ENDPROC(preserve_boot_args) __create_page_tables: adrp x25, idmap_pg_dir adrp x26, swapper_pg_dir - mov x27, lr + mov x28, lr /* * Invalidate the idmap and swapper page tables to avoid potential @@ -392,6 +393,7 @@ __create_page_tables: */ mov x0, x26 // swapper_pg_dir ldr x5, =KIMAGE_VADDR + add x5, x5, x23 // add KASLR displacement create_pgd_entry x0, x5, x3, x6 ldr w6, kernel_img_size add x6, x6, x5 @@ -408,8 +410,7 @@ __create_page_tables: dmb sy bl __inval_cache_range - mov lr, x27 - ret + ret x28 ENDPROC(__create_page_tables) kernel_img_size: @@ -421,6 +422,7 @@ kernel_img_size: */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: + mov x28, lr // preserve LR adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -449,19 +451,26 @@ __mmap_switched: ldr x13, [x9, #-8] cmp w12, #R_AARCH64_RELATIVE b.ne 1f - str x13, [x11] + add x13, x13, x23 // relocate + str x13, [x11, x23] b 0b 1: cmp w12, #R_AARCH64_ABS64 b.ne 0b add x12, x12, x12, lsl #1 // symtab offset: 24x top word add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word + ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx ldr x15, [x12, #8] // Elf64_Sym::st_value + cmp w14, #-0xf // SHN_ABS (0xfff1) ? 
+ add x14, x15, x23 // relocate + csel x15, x14, x15, ne add x15, x13, x15 - str x15, [x11] + str x15, [x11, x23] b 0b -2: +2: adr_l x8, kimage_vaddr // make relocated kimage_vaddr + dc cvac, x8 // value visible to secondaries + dsb sy // with MMU off #endif adr_l sp, initial_sp, x4 @@ -470,13 +479,23 @@ __mmap_switched: msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer - ldr x4, =KIMAGE_VADDR // Save the offset between + ldr_l x4, kimage_vaddr // Save the offset between sub x4, x4, x24 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings mov x29, #0 #ifdef CONFIG_KASAN bl kasan_early_init +#endif +#ifdef CONFIG_RANDOMIZE_BASE + cbnz x23, 0f // already running randomized? + mov x0, x21 // pass FDT address in x0 + bl kaslr_early_init // parse FDT for KASLR options + cbz x0, 0f // KASLR disabled? just proceed + mov x23, x0 // record KASLR offset + ret x28 // we must enable KASLR, return + // to __enable_mmu() +0: #endif b start_kernel ENDPROC(__mmap_switched) @@ -486,6 +505,10 @@ ENDPROC(__mmap_switched) * hotplug and needs to have the same protections as the text region */ .section ".text","ax" + +ENTRY(kimage_vaddr) + .quad _text - TEXT_OFFSET + /* * If we're fortunate enough to boot at EL2, ensure that the world is * sane before dropping to EL1. @@ -651,7 +674,7 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x8, =KIMAGE_VADDR + ldr x8, kimage_vaddr ldr w9, 0f sub x27, x8, w9, sxtw // address to jump to after enabling the MMU b __enable_mmu @@ -684,6 +707,7 @@ ENDPROC(__secondary_switched) */ .section ".idmap.text", "ax" __enable_mmu: + mrs x18, sctlr_el1 // preserve old SCTLR_EL1 value mrs x1, ID_AA64MMFR0_EL1 ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED @@ -701,6 +725,25 @@ __enable_mmu: ic iallu dsb nsh isb +#ifdef CONFIG_RANDOMIZE_BASE + mov x19, x0 // preserve new SCTLR_EL1 value + blr x27 + + /* + * If we return here, we have a KASLR displacement in x23 which we need + * to take into account by discarding the current kernel mapping and + * creating a new one. + */ + msr sctlr_el1, x18 // disable the MMU + isb + bl __create_page_tables // recreate kernel mapping + + msr sctlr_el1, x19 // re-enable the MMU + isb + ic ialluis // flush instructions fetched + isb // via old mapping + add x27, x27, x23 // relocated __mmap_switched +#endif br x27 ENDPROC(__enable_mmu) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c new file mode 100644 index 000000000000..8b32a1f8f09f --- /dev/null +++ b/arch/arm64/kernel/kaslr.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2016 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +u64 __read_mostly module_alloc_base; + +static __init u64 get_kaslr_seed(void *fdt) +{ + int node, len; + u64 *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +static __init const u8 *get_cmdline(void *fdt) +{ + static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + int node; + const u8 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + goto out; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + goto out; + return prop; + } +out: + return default_cmdline; +} + +extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, + pgprot_t prot); + +/* + * This routine will be executed with the kernel mapped at its default virtual + * address, and if it returns successfully, the kernel will be remapped, and + * start_kernel() will be executed from a randomized virtual offset. The + * relocation will result in all absolute references (e.g., static variables + * containing function pointers) to be reinitialized, and zero-initialized + * .bss variables will be reset to 0. + */ +u64 __init kaslr_early_init(u64 dt_phys) +{ + void *fdt; + u64 seed, offset, mask, module_range; + const u8 *cmdline, *str; + int size; + + /* + * Set a reasonable default for module_alloc_base in case + * we end up running with module randomization disabled. + */ + module_alloc_base = (u64)_etext - MODULES_VSIZE; + + /* + * Try to map the FDT early. If this fails, we simply bail, + * and proceed with KASLR disabled. We will make another + * attempt at mapping the FDT in setup_machine() + */ + early_fixmap_init(); + fdt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); + if (!fdt) + return 0; + + /* + * Retrieve (and wipe) the seed from the FDT + */ + seed = get_kaslr_seed(fdt); + if (!seed) + return 0; + + /* + * Check if 'nokaslr' appears on the command line, and + * return 0 if that is the case. + */ + cmdline = get_cmdline(fdt); + str = strstr(cmdline, "nokaslr"); + if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) + return 0; + + /* + * OK, so we are proceeding with KASLR enabled. Calculate a suitable + * kernel image offset from the seed. Let's place the kernel in the + * lower half of the VMALLOC area (VA_BITS - 2). + * Even if we could randomize at page granularity for 16k and 64k pages, + * let's always round to 2 MB so we don't interfere with the ability to + * map using contiguous PTEs + */ + mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); + offset = seed & mask; + + /* + * The kernel Image should not extend across a 1GB/32MB/512MB alignment + * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this + * happens, increase the KASLR offset by the size of the kernel image. + */ + if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + offset = (offset + (u64)(_end - _text)) & mask; + + if (IS_ENABLED(CONFIG_KASAN)) + /* + * KASAN does not expect the module region to intersect the + * vmalloc region, since shadow memory is allocated for each + * module at load time, whereas the vmalloc region is shadowed + * by KASAN zero pages. So keep modules out of the vmalloc + * region if KASAN is enabled. 
+ */ + return offset; + + if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { + /* + * Randomize the module region independently from the core + * kernel. This prevents modules from leaking any information + * about the address of the kernel itself, but results in + * branches between modules and the core kernel that are + * resolved via PLTs. (Branches between modules will be + * resolved normally.) + */ + module_range = VMALLOC_END - VMALLOC_START - MODULES_VSIZE; + module_alloc_base = VMALLOC_START; + } else { + /* + * Randomize the module region by setting module_alloc_base to + * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, + * _stext) . This guarantees that the resulting region still + * covers [_stext, _etext], and that all relative branches can + * be resolved without veneers. + */ + module_range = MODULES_VSIZE - (u64)(_etext - _stext); + module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; + } + + /* use the lower 21 bits to randomize the base of the module region */ + module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; + module_alloc_base &= PAGE_MASK; + + return offset; +} diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index a9dde97f5ca5..7f316982ce00 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -34,7 +34,8 @@ void *module_alloc(unsigned long size) { void *p; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, + module_alloc_base + MODULES_VSIZE, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 079db3ba21c4..29b8c247d56f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -388,3 +388,32 @@ static int __init topology_init(void) return 0; } subsys_initcall(topology_init); + +/* + * Dump out kernel offset information on panic. + */ +static int dump_kernel_offset(struct notifier_block *self, unsigned long v, + void *p) +{ + u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR; + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) { + pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n", + kaslr_offset, KIMAGE_VADDR); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } + return 0; +} + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + return 0; +} +__initcall(register_kernel_offset_dumper); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 7f10cc91fa8a..56e19d150c21 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -129,12 +129,16 @@ static void __init clear_pgds(unsigned long start, void __init kasan_init(void) { u64 kimg_shadow_start, kimg_shadow_end; + u64 mod_shadow_start, mod_shadow_end; struct memblock_region *reg; int i; kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); + mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call bellow). @@ -158,13 +162,20 @@ void __init kasan_init(void) * with PMD table mappings at the edges of the shadow region for the * kernel image. 
*/ - if (ARM64_SWAPPER_USES_SECTION_MAPS) + if (ARM64_SWAPPER_USES_SECTION_MAPS) { + kimg_shadow_start = round_down(kimg_shadow_start, + SWAPPER_BLOCK_SIZE); kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + } kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, - kasan_mem_to_shadow((void *)MODULES_VADDR)); + (void *)mod_shadow_start); kasan_populate_zero_shadow((void *)kimg_shadow_end, - kasan_mem_to_shadow((void *)PAGE_OFFSET)); + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + if (kimg_shadow_start > mod_shadow_end) + kasan_populate_zero_shadow((void *)mod_shadow_end, + (void *)kimg_shadow_start); for_each_memblock(memory, reg) { void *start = (void *)__phys_to_virt(reg->base); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8577437b6c0a..f4d1da0c1531 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -676,7 +676,8 @@ void __init early_fixmap_init(void) unsigned long addr = FIXADDR_START; pgd = pgd_offset_k(addr); - if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { + if (CONFIG_PGTABLE_LEVELS > 3 && + !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on @@ -733,11 +734,10 @@ void __set_fixmap(enum fixed_addresses idx, } } -void *__init fixmap_remap_fdt(phys_addr_t dt_phys) +void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) { const u64 dt_virt_base = __fix_to_virt(FIX_FDT); - pgprot_t prot = PAGE_KERNEL_RO; - int size, offset; + int offset; void *dt_virt; /* @@ -776,16 +776,27 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) if (fdt_check_header(dt_virt) != 0) return NULL; - size = fdt_totalsize(dt_virt); - if (size > MAX_FDT_SIZE) + *size = fdt_totalsize(dt_virt); + if (*size > MAX_FDT_SIZE) return NULL; - if (offset + size > SWAPPER_BLOCK_SIZE) + if (offset + *size > SWAPPER_BLOCK_SIZE) create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base, - round_up(offset + size, SWAPPER_BLOCK_SIZE), prot); + round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot); + + return dt_virt; +} + +void *__init fixmap_remap_fdt(phys_addr_t dt_phys) +{ + void *dt_virt; + int size; + + dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); + if (!dt_virt) + return NULL; memblock_reserve(dt_phys, size); - return dt_virt; } From 09cbceb939d85d7d25b8e424445a2f83cac97c9d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Feb 2016 17:57:13 +0100 Subject: [PATCH 0440/3220] UPSTREAM: arm64: vmemmap: use virtual projection of linear region Commit dd006da21646 ("arm64: mm: increase VA range of identity map") made some changes to the memory mapping code to allow physical memory to reside at an offset that exceeds the size of the virtual mapping. However, since the size of the vmemmap area is proportional to the size of the VA area, but it is populated relative to the physical space, we may end up with the struct page array being mapped outside of the vmemmap region. For instance, on my Seattle A0 box, I can see the following output in the dmesg log. vmemmap : 0xffffffbdc0000000 - 0xffffffbfc0000000 ( 8 GB maximum) 0xffffffbfc0000000 - 0xffffffbfd0000000 ( 256 MB actual) We can fix this by deciding that the vmemmap region is not a projection of the physical space, but of the virtual space above PAGE_OFFSET, i.e., the linear region. This way, we are guaranteed that the vmemmap region is of sufficient size, and we can even reduce the size by half. 
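To put numbers on the halving, here is a stand-alone back-of-the-envelope calculation for the configuration in the dmesg excerpt above (VA_BITS=39, 4 KB pages). The 64-byte sizeof(struct page) is an assumption typical of kernels of this vintage, and the ALIGN(..., PUD_SIZE) rounding is ignored:

#include <stdio.h>

int main(void)
{
        const unsigned va_bits = 39, page_shift = 12;
        const unsigned long long sizeof_page = 64;      /* assumed */

        /* Before: one struct page per page of the whole VA space. */
        unsigned long long old_size = (1ULL << (va_bits - page_shift)) * sizeof_page;
        /* After: only the linear region (half the VA space) is covered. */
        unsigned long long new_size = (1ULL << (va_bits - page_shift - 1)) * sizeof_page;

        printf("old VMEMMAP_SIZE: %llu GB\n", old_size >> 30);  /* 8 GB */
        printf("new VMEMMAP_SIZE: %llu GB\n", new_size >> 30);  /* 4 GB */
        return 0;
}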
Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit dfd55ad85e4a7fbaa82df12467515ac3c81e8a3e) Signed-off-by: Jeff Vander Stoep Change-Id: I8112d910f9659941dab6de5b3791f395150c77f1 --- arch/arm64/include/asm/pgtable.h | 7 ++++--- arch/arm64/mm/init.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9bf72b72b76d..d48ea8fc789d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -34,18 +34,19 @@ /* * VMALLOC and SPARSEMEM_VMEMMAP ranges. * - * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array + * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array * (rounded up to PUD_SIZE). * VMALLOC_START: beginning of the kernel vmalloc space * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) -#define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) +#define VMEMMAP_START (VMALLOC_END + SZ_64K) +#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e333f7f0b4f2..04c7e8ba47ce 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -365,8 +365,8 @@ void __init mem_init(void) MLK_ROUNDUP(_text, _etext), MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP - MLG((unsigned long)vmemmap, - (unsigned long)vmemmap + VMEMMAP_SIZE), + MLG(VMEMMAP_START, + VMEMMAP_START + VMEMMAP_SIZE), MLM((unsigned long)virt_to_page(PAGE_OFFSET), (unsigned long)virt_to_page(high_memory)), #endif From ee97096e89bbb5f39614638bd32fded97eb08691 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Feb 2016 17:57:14 +0100 Subject: [PATCH 0441/3220] UPSTREAM: arm64: mm: treat memstart_addr as a signed quantity Commit c031a4213c11 ("arm64: kaslr: randomize the linear region") implements randomization of the linear region, by subtracting a random multiple of PUD_SIZE from memstart_addr. This causes the virtual mapping of system RAM to move upwards in the linear region, and at the same time causes memstart_addr to assume a value which may be negative if the offset of system RAM in the physical space is smaller than its offset relative to PAGE_OFFSET in the virtual space. Since memstart_addr is effectively an offset now, redefine its type as s64 so that expressions involving shifting or division preserve its sign. 
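The sign-preservation issue is easy to reproduce in isolation. In this stand-alone sketch the offset value is invented; the point is that a logical (unsigned) shift of a negative offset yields a huge bogus result, while the arithmetic shift on s64 behaves as intended:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t memstart = -0x40000000;         /* invented negative offset */
        uint64_t unsigned_view = (uint64_t)memstart;

        /* Arithmetic shift (on the usual ABIs) keeps the sign: -262144. */
        printf("s64 >> 12: %lld\n", (long long)(memstart >> 12));
        /* Logical shift manufactures an enormous positive value. */
        printf("u64 >> 12: %llu\n", (unsigned long long)(unsigned_view >> 12));
        return 0;
}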
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 020d044f66874eba058ce8264fc550f3eca67879) Signed-off-by: Jeff Vander Stoep Change-Id: I0482ebc13baaa9005cf372795e656c2417be9d1c --- arch/arm64/include/asm/memory.h | 2 +- arch/arm64/mm/init.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 5f8667a99e41..12f8a00fb3f1 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -135,7 +135,7 @@ #include #include -extern phys_addr_t memstart_addr; +extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 04c7e8ba47ce..3d75ff40c5ff 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -54,7 +54,7 @@ * executes, which assigns it its actual value. So use a default value * that cannot be mistaken for a real physical address. */ -phys_addr_t memstart_addr __read_mostly = ~0ULL; +s64 memstart_addr __read_mostly = -1; phys_addr_t arm64_dma_phys_limit __read_mostly; #ifdef CONFIG_BLK_DEV_INITRD @@ -183,7 +183,7 @@ void __init arm64_memblock_init(void) * linear mapping. Take care not to clip the kernel which may be * high in memory. */ - memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)), + memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)), ULLONG_MAX); if (memblock_end_of_DRAM() > linear_region_size) memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); From 22346f1e99e4ed88ed8f68d95875a92395c3fea4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 29 Jan 2016 11:59:03 +0100 Subject: [PATCH 0442/3220] BACKPORT: arm64: kaslr: randomize the linear region When KASLR is enabled (CONFIG_RANDOMIZE_BASE=y), and entropy has been provided by the bootloader, randomize the placement of RAM inside the linear region if sufficient space is available. For instance, on a 4KB granule/3 levels kernel, the linear region is 256 GB in size, and we can choose any 1 GB aligned offset that is far enough from the top of the address space to fit the distance between the start of the lowest memblock and the top of the highest memblock. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit c031a4213c11a5db475f528c182f7b3858df11db) Signed-off-by: Jeff Vander Stoep Change-Id: I272de8ee358351d95eacc7dc5f47600adec3e813 --- arch/arm64/kernel/kaslr.c | 4 ++++ arch/arm64/mm/init.c | 22 ++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 8b32a1f8f09f..582983920054 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -21,6 +21,7 @@ #include u64 __read_mostly module_alloc_base; +u16 __initdata memstart_offset_seed; static __init u64 get_kaslr_seed(void *fdt) { @@ -123,6 +124,9 @@ u64 __init kaslr_early_init(u64 dt_phys) mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); offset = seed & mask; + /* use the top 16 bits to randomize the linear region */ + memstart_offset_seed = seed >> 48; + /* * The kernel Image should not extend across a 1GB/32MB/512MB alignment * boundary (for 4KB/16KB/64KB granule kernels, respectively). 
If this diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 3d75ff40c5ff..8e3cf19e4f00 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -198,6 +198,23 @@ void __init arm64_memblock_init(void) memblock_add(__pa(_text), (u64)(_end - _text)); } + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + extern u16 memstart_offset_seed; + u64 range = linear_region_size - + (memblock_end_of_DRAM() - memblock_start_of_DRAM()); + + /* + * If the size of the linear region exceeds, by a sufficient + * margin, the size of the region that the available physical + * memory spans, randomize the linear region as well. + */ + if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) { + range = range / ARM64_MEMSTART_ALIGN + 1; + memstart_addr -= ARM64_MEMSTART_ALIGN * + ((range * memstart_offset_seed) >> 16); + } + } + /* * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. @@ -367,12 +384,13 @@ void __init mem_init(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE), - MLM((unsigned long)virt_to_page(PAGE_OFFSET), + MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), (unsigned long)virt_to_page(high_memory)), #endif MLK(FIXADDR_START, FIXADDR_TOP), MLM(PCI_IO_START, PCI_IO_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory)); + MLM(__phys_to_virt(memblock_start_of_DRAM()), + (unsigned long)high_memory)); #undef MLK #undef MLM From 39b0815e4872fe6eb20a7d1b42b551e341cb7167 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 10 Jan 2016 11:29:07 +0100 Subject: [PATCH 0443/3220] BACKPORT: efi: stub: implement efi_get_random_bytes() based on EFI_RNG_PROTOCOL This exposes the firmware's implementation of EFI_RNG_PROTOCOL via a new function efi_get_random_bytes(). Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit e4fbf4767440472f9d23b0f25a2b905e1c63b6a8) Signed-off-by: Jeff Vander Stoep Change-Id: Id46036b78c2efd223b6cd5488e512fd93e8f597d --- drivers/firmware/efi/libstub/Makefile | 2 +- drivers/firmware/efi/libstub/efistub.h | 3 +++ drivers/firmware/efi/libstub/random.c | 35 ++++++++++++++++++++++++++ include/linux/efi.h | 6 ++++- 4 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 drivers/firmware/efi/libstub/random.c diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index c0ddd1b8dca3..68bdd9f23e13 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -34,7 +34,7 @@ $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE lib-$(CONFIG_EFI_ARMSTUB) += arm-stub.o fdt.o string.o \ $(patsubst %.c,lib-%.o,$(arm-deps)) -lib-$(CONFIG_ARM64) += arm64-stub.o +lib-$(CONFIG_ARM64) += arm64-stub.o random.o CFLAGS_arm64-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) # diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 6b6548fda089..206b7252b9d1 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -43,4 +43,7 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, unsigned long desc_size, efi_memory_desc_t *runtime_map, int *count); +efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table, + unsigned long size, u8 *out); + #endif diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c new file mode 100644 index 000000000000..97941ee5954f --- /dev/null +++ 
b/drivers/firmware/efi/libstub/random.c @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 Linaro Ltd; + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include "efistub.h" + +struct efi_rng_protocol { + efi_status_t (*get_info)(struct efi_rng_protocol *, + unsigned long *, efi_guid_t *); + efi_status_t (*get_rng)(struct efi_rng_protocol *, + efi_guid_t *, unsigned long, u8 *out); +}; + +efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table_arg, + unsigned long size, u8 *out) +{ + efi_guid_t rng_proto = EFI_RNG_PROTOCOL_GUID; + efi_status_t status; + struct efi_rng_protocol *rng; + + status = efi_call_early(locate_protocol, &rng_proto, NULL, + (void **)&rng); + if (status != EFI_SUCCESS) + return status; + + return rng->get_rng(rng, NULL, size, out); +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 569b5a866bb1..e747eb08b2be 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -299,7 +299,7 @@ typedef struct { void *open_protocol_information; void *protocols_per_handle; void *locate_handle_buffer; - void *locate_protocol; + efi_status_t (*locate_protocol)(efi_guid_t *, void *, void **); void *install_multiple_protocol_interfaces; void *uninstall_multiple_protocol_interfaces; void *calculate_crc32; @@ -599,6 +599,10 @@ void efi_native_runtime_setup(void); #define EFI_PROPERTIES_TABLE_GUID \ EFI_GUID( 0x880aaca3, 0x4adc, 0x4a04, 0x90, 0x79, 0xb7, 0x47, 0x34, 0x08, 0x25, 0xe5 ) +#define EFI_RNG_PROTOCOL_GUID \ + EFI_GUID(0x3152bca5, 0xeade, 0x433d, \ + 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44) + typedef struct { efi_guid_t guid; u64 table; From 718645ea36329edb7d1c01871c13385fe1a2f03a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 10:43:16 +0100 Subject: [PATCH 0444/3220] UPSTREAM: efi: stub: add implementation of efi_random_alloc() This implements efi_random_alloc(), which allocates a chunk of memory of a certain size at a certain alignment, and uses the random_seed argument it receives to randomize the address of the allocation. This is implemented by iterating over the UEFI memory map, counting the number of suitable slots (aligned offsets) within each region, and picking a random number between 0 and 'number of slots - 1' to select the slot. This should guarantee that each possible offset is equally likely to be chosen.
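For illustration, the two-pass selection described above can be modelled in ordinary user-space C. The sketch below is an approximation rather than the stub code itself: struct region stands in for an EFI memory descriptor, the RAM banks and the seed value are invented for the example, and the slot count is written in its simplest form rather than the patch's exact expression.

#include <stdio.h>
#include <stdint.h>

struct region {				/* stand-in for one EFI memory descriptor */
	uint64_t base;
	uint64_t size;
	uint64_t slots;			/* filled in by the first pass */
};

static uint64_t rnd_up(uint64_t x, uint64_t a)   { return (x + a - 1) & ~(a - 1); }
static uint64_t rnd_down(uint64_t x, uint64_t a) { return x & ~(a - 1); }

/* number of aligned start addresses that leave room for the allocation */
static uint64_t num_slots(const struct region *r, uint64_t size, uint64_t align)
{
	uint64_t start = rnd_up(r->base, align);
	uint64_t end = rnd_down(r->base + r->size - size, align);

	return (start > end) ? 0 : (end - start) / align + 1;
}

int main(void)
{
	struct region map[2] = {			/* invented RAM banks */
		{ 0x40000000, 0x20000000, 0 },
		{ 0x80000000, 0x40000000, 0 },
	};
	uint64_t size = 0x02000000, align = 0x200000;	/* 32 MB image, 2 MB slots */
	uint64_t seed = 0xbeef;				/* 16 bits of RNG output */
	uint64_t total = 0, target;
	int i;

	/* pass 1: count the suitable slots in each memory map entry */
	for (i = 0; i < 2; i++) {
		map[i].slots = num_slots(&map[i], size, align);
		total += map[i].slots;
	}

	/* scale the 16-bit seed into [0, total), as the patch does with >> 16 */
	target = (total * (seed & 0xffff)) >> 16;

	/* pass 2: walk the entries again until the residual index lands in one */
	for (i = 0; i < 2; i++) {
		if (target >= map[i].slots) {
			target -= map[i].slots;
			continue;
		}
		printf("allocate at %#llx\n", (unsigned long long)
		       (rnd_up(map[i].base, align) + target * align));
		break;
	}
	return 0;
}

Because target is drawn from [0, total), it always corresponds to exactly one recorded slot, so the second walk is guaranteed to terminate with an address.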
Suggested-by: Kees Cook Reviewed-by: Matt Fleming Reviewed-by: Kees Cook Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2ddbfc81eac84a299cb4747a8764bc43f23e9008) Signed-off-by: Jeff Vander Stoep Change-Id: I8f59e3e91a71c752d69fd08ca43a890977c82919 --- drivers/firmware/efi/libstub/efistub.h | 4 + drivers/firmware/efi/libstub/random.c | 100 +++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 206b7252b9d1..5ed3d3f38166 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -46,4 +46,8 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size, efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table, unsigned long size, u8 *out); +efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, + unsigned long size, unsigned long align, + unsigned long *addr, unsigned long random_seed); + #endif diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index 97941ee5954f..53f6d3fe6d86 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -33,3 +33,103 @@ efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table_arg, return rng->get_rng(rng, NULL, size, out); } + +/* + * Return the number of slots covered by this entry, i.e., the number of + * addresses it covers that are suitably aligned and supply enough room + * for the allocation. + */ +static unsigned long get_entry_num_slots(efi_memory_desc_t *md, + unsigned long size, + unsigned long align) +{ + u64 start, end; + + if (md->type != EFI_CONVENTIONAL_MEMORY) + return 0; + + start = round_up(md->phys_addr, align); + end = round_down(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - size, + align); + + if (start > end) + return 0; + + return (end - start + 1) / align; +} + +/* + * The UEFI memory descriptors have a virtual address field that is only used + * when installing the virtual mapping using SetVirtualAddressMap(). Since it + * is unused here, we can reuse it to keep track of each descriptor's slot + * count. + */ +#define MD_NUM_SLOTS(md) ((md)->virt_addr) + +efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, + unsigned long size, + unsigned long align, + unsigned long *addr, + unsigned long random_seed) +{ + unsigned long map_size, desc_size, total_slots = 0, target_slot; + efi_status_t status; + efi_memory_desc_t *memory_map; + int map_offset; + + status = efi_get_memory_map(sys_table_arg, &memory_map, &map_size, + &desc_size, NULL, NULL); + if (status != EFI_SUCCESS) + return status; + + if (align < EFI_ALLOC_ALIGN) + align = EFI_ALLOC_ALIGN; + + /* count the suitable slots in each memory map entry */ + for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { + efi_memory_desc_t *md = (void *)memory_map + map_offset; + unsigned long slots; + + slots = get_entry_num_slots(md, size, align); + MD_NUM_SLOTS(md) = slots; + total_slots += slots; + } + + /* find a random number between 0 and total_slots */ + target_slot = (total_slots * (u16)random_seed) >> 16; + + /* + * target_slot is now a value in the range [0, total_slots), and so + * it corresponds with exactly one of the suitable slots we recorded + * when iterating over the memory map the first time around. 
+ * + * So iterate over the memory map again, subtracting the number of + * slots of each entry at each iteration, until we have found the entry + * that covers our chosen slot. Use the residual value of target_slot + * to calculate the randomly chosen address, and allocate it directly + * using EFI_ALLOCATE_ADDRESS. + */ + for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { + efi_memory_desc_t *md = (void *)memory_map + map_offset; + efi_physical_addr_t target; + unsigned long pages; + + if (target_slot >= MD_NUM_SLOTS(md)) { + target_slot -= MD_NUM_SLOTS(md); + continue; + } + + target = round_up(md->phys_addr, align) + target_slot * align; + pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + + status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, pages, &target); + if (status == EFI_SUCCESS) + *addr = target; + break; + } + + efi_call_early(free_pool, memory_map); + + return status; +} From 3fafb1a67d5261cd8a0d9a9ad0316baf98d0bba8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 11:47:49 +0100 Subject: [PATCH 0445/3220] UPSTREAM: efi: stub: use high allocation for converted command line Before we can move the command line processing before the allocation of the kernel, which is required for detecting the 'nokaslr' option which controls that allocation, move the converted command line higher up in memory, to prevent it from interfering with the kernel itself. Since x86 needs the address to fit in 32 bits, use UINT_MAX as the upper bound there. Otherwise, use ULONG_MAX (i.e., no limit) Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 48fcb2d0216103d15306caa4814e2381104df6d8) Signed-off-by: Jeff Vander Stoep Change-Id: Ie959355658d3f2f1819bee842c77cc5eef54b8e7 --- arch/x86/include/asm/efi.h | 2 ++ drivers/firmware/efi/libstub/efi-stub-helper.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0010c78c4998..08b1f2f6ea50 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -25,6 +25,8 @@ #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" +#define MAX_CMDLINE_ADDRESS UINT_MAX + #ifdef CONFIG_X86_32 diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index f07d4a67fa76..29ed2f9b218c 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -649,6 +649,10 @@ static u8 *efi_utf16_to_utf8(u8 *dst, const u16 *src, int n) return dst; } +#ifndef MAX_CMDLINE_ADDRESS +#define MAX_CMDLINE_ADDRESS ULONG_MAX +#endif + /* * Convert the unicode UEFI command line to ASCII to pass to kernel. * Size of memory allocated return in *cmd_line_len. 
@@ -684,7 +688,8 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg, options_bytes++; /* NUL termination */ - status = efi_low_alloc(sys_table_arg, options_bytes, 0, &cmdline_addr); + status = efi_high_alloc(sys_table_arg, options_bytes, 0, + &cmdline_addr, MAX_CMDLINE_ADDRESS); if (status != EFI_SUCCESS) return NULL; From 4de18b705de63c094a0c23754337c750d072ade9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 14:48:29 +0100 Subject: [PATCH 0446/3220] UPSTREAM: arm64: efi: invoke EFI_RNG_PROTOCOL to supply KASLR randomness Since arm64 does not use a decompressor that supplies an execution environment where it is feasible to some extent to provide a source of randomness, the arm64 KASLR kernel depends on the bootloader to supply some random bits in the /chosen/kaslr-seed DT property upon kernel entry. On UEFI systems, we can use the EFI_RNG_PROTOCOL, if supplied, to obtain some random bits. At the same time, use it to randomize the offset of the kernel Image in physical memory. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2b5fe07a78a09a32002642b8a823428ade611f16) Signed-off-by: Jeff Vander Stoep Change-Id: I9cb7ae5727dfdf3726b1c9544bce74722ec77bbd --- arch/arm64/Kconfig | 5 ++ drivers/firmware/efi/libstub/arm-stub.c | 40 +++++++---- drivers/firmware/efi/libstub/arm64-stub.c | 84 ++++++++++++++++------- drivers/firmware/efi/libstub/fdt.c | 14 ++++ 4 files changed, 105 insertions(+), 38 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a42e3235484b..0b5ec89a8ec4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -794,6 +794,11 @@ config RANDOMIZE_BASE It is the bootloader's job to provide entropy, by passing a random u64 value in /chosen/kaslr-seed at kernel entry. + When booting via the UEFI stub, it will invoke the firmware's + EFI_RNG_PROTOCOL implementation (if available) to supply entropy + to the kernel proper. In addition, it will randomise the physical + location of the kernel Image as well. + If unsure, say N. 
config RANDOMIZE_MODULE_REGION_FULL diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 950c87f5d279..d5aa1d16154f 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -18,6 +18,8 @@ #include "efistub.h" +bool __nokaslr; + static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) { static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; @@ -207,14 +209,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, pr_efi_err(sys_table, "Failed to find DRAM base\n"); goto fail; } - status = handle_kernel_image(sys_table, image_addr, &image_size, - &reserve_addr, - &reserve_size, - dram_base, image); - if (status != EFI_SUCCESS) { - pr_efi_err(sys_table, "Failed to relocate kernel\n"); - goto fail; - } /* * Get the command line from EFI, using the LOADED_IMAGE @@ -224,7 +218,28 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, cmdline_ptr = efi_convert_cmdline(sys_table, image, &cmdline_size); if (!cmdline_ptr) { pr_efi_err(sys_table, "getting command line via LOADED_IMAGE_PROTOCOL\n"); - goto fail_free_image; + goto fail; + } + + /* check whether 'nokaslr' was passed on the command line */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + static const u8 default_cmdline[] = CONFIG_CMDLINE; + const u8 *str, *cmdline = cmdline_ptr; + + if (IS_ENABLED(CONFIG_CMDLINE_FORCE)) + cmdline = default_cmdline; + str = strstr(cmdline, "nokaslr"); + if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) + __nokaslr = true; + } + + status = handle_kernel_image(sys_table, image_addr, &image_size, + &reserve_addr, + &reserve_size, + dram_base, image); + if (status != EFI_SUCCESS) { + pr_efi_err(sys_table, "Failed to relocate kernel\n"); + goto fail_free_cmdline; } status = efi_parse_options(cmdline_ptr); @@ -244,7 +259,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (status != EFI_SUCCESS) { pr_efi_err(sys_table, "Failed to load device tree!\n"); - goto fail_free_cmdline; + goto fail_free_image; } } @@ -286,12 +301,11 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_free(sys_table, initrd_size, initrd_addr); efi_free(sys_table, fdt_size, fdt_addr); -fail_free_cmdline: - efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); - fail_free_image: efi_free(sys_table, image_size, *image_addr); efi_free(sys_table, reserve_size, reserve_addr); +fail_free_cmdline: + efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); fail: return EFI_ERROR; } diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 78dfbd34b6bf..e0e6b74fef8f 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -13,6 +13,10 @@ #include #include +#include "efistub.h" + +extern bool __nokaslr; + efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, unsigned long *image_addr, unsigned long *image_size, @@ -23,26 +27,52 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, { efi_status_t status; unsigned long kernel_size, kernel_memsize = 0; - unsigned long nr_pages; void *old_image_addr = (void *)*image_addr; unsigned long preferred_offset; + u64 phys_seed = 0; + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + if (!__nokaslr) { + status = efi_get_random_bytes(sys_table_arg, + sizeof(phys_seed), + (u8 *)&phys_seed); + if (status == EFI_NOT_FOUND) { + pr_efi(sys_table_arg, "EFI_RNG_PROTOCOL 
unavailable, no randomness supplied\n"); + } else if (status != EFI_SUCCESS) { + pr_efi_err(sys_table_arg, "efi_get_random_bytes() failed\n"); + return status; + } + } else { + pr_efi(sys_table_arg, "KASLR disabled on kernel command line\n"); + } + } /* * The preferred offset of the kernel Image is TEXT_OFFSET bytes beyond * a 2 MB aligned base, which itself may be lower than dram_base, as * long as the resulting offset equals or exceeds it. */ - preferred_offset = round_down(dram_base, SZ_2M) + TEXT_OFFSET; + preferred_offset = round_down(dram_base, MIN_KIMG_ALIGN) + TEXT_OFFSET; if (preferred_offset < dram_base) - preferred_offset += SZ_2M; + preferred_offset += MIN_KIMG_ALIGN; - /* Relocate the image, if required. */ kernel_size = _edata - _text; - if (*image_addr != preferred_offset) { - kernel_memsize = kernel_size + (_end - _edata); + kernel_memsize = kernel_size + (_end - _edata); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) { /* - * First, try a straight allocation at the preferred offset. + * If KASLR is enabled, and we have some randomness available, + * locate the kernel at a randomized offset in physical memory. + */ + *reserve_size = kernel_memsize + TEXT_OFFSET; + status = efi_random_alloc(sys_table_arg, *reserve_size, + MIN_KIMG_ALIGN, reserve_addr, + phys_seed); + + *image_addr = *reserve_addr + TEXT_OFFSET; + } else { + /* + * Else, try a straight allocation at the preferred offset. * This will work around the issue where, if dram_base == 0x0, * efi_low_alloc() refuses to allocate at 0x0 (to prevent the * address of the allocation to be mistaken for a FAIL return @@ -52,27 +82,31 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg, * Mustang), we can still place the kernel at the address * 'dram_base + TEXT_OFFSET'. 
*/ - *image_addr = *reserve_addr = preferred_offset; - nr_pages = round_up(kernel_memsize, EFI_ALLOC_ALIGN) / - EFI_PAGE_SIZE; - status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, nr_pages, - (efi_physical_addr_t *)reserve_addr); - if (status != EFI_SUCCESS) { - kernel_memsize += TEXT_OFFSET; - status = efi_low_alloc(sys_table_arg, kernel_memsize, - SZ_2M, reserve_addr); + if (*image_addr == preferred_offset) + return EFI_SUCCESS; - if (status != EFI_SUCCESS) { - pr_efi_err(sys_table_arg, "Failed to relocate kernel\n"); - return status; - } - *image_addr = *reserve_addr + TEXT_OFFSET; - } - memcpy((void *)*image_addr, old_image_addr, kernel_size); - *reserve_size = kernel_memsize; + *image_addr = *reserve_addr = preferred_offset; + *reserve_size = round_up(kernel_memsize, EFI_ALLOC_ALIGN); + + status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, + *reserve_size / EFI_PAGE_SIZE, + (efi_physical_addr_t *)reserve_addr); } + if (status != EFI_SUCCESS) { + *reserve_size = kernel_memsize + TEXT_OFFSET; + status = efi_low_alloc(sys_table_arg, *reserve_size, + MIN_KIMG_ALIGN, reserve_addr); + + if (status != EFI_SUCCESS) { + pr_efi_err(sys_table_arg, "Failed to relocate kernel\n"); + *reserve_size = 0; + return status; + } + *image_addr = *reserve_addr + TEXT_OFFSET; + } + memcpy((void *)*image_addr, old_image_addr, kernel_size); return EFI_SUCCESS; } diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index b62e2f5dcab3..b1c22cf18f7d 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -147,6 +147,20 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt, if (status) goto fdt_set_fail; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + efi_status_t efi_status; + + efi_status = efi_get_random_bytes(sys_table, sizeof(fdt_val64), + (u8 *)&fdt_val64); + if (efi_status == EFI_SUCCESS) { + status = fdt_setprop(fdt, node, "kaslr-seed", + &fdt_val64, sizeof(fdt_val64)); + if (status) + goto fdt_set_fail; + } else if (efi_status != EFI_NOT_FOUND) { + return efi_status; + } + } return EFI_SUCCESS; fdt_set_fail: From a2f0639adbc63164137683f6d0c4deb88406290d Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Feb 2016 13:53:10 -0800 Subject: [PATCH 0447/3220] UPSTREAM: arm64: make irq_stack_ptr more robust Switching between stacks is only valid if we are tracing ourselves while on the irq_stack, so it is only valid when in current and non-preemptible context, otherwise it is just zeroed off. Fixes: 132cd887b5c5 ("arm64: Modify stack trace and dump for use with irq_stack") Acked-by: James Morse Tested-by: James Morse Signed-off-by: Yang Shi Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit a80a0eb70c358f8c7dda4bb62b2278dc6285217b) Signed-off-by: Jeff Vander Stoep Change-Id: I431d3d5e8e1f556ddfef283af88dd2f63b825f7c --- arch/arm64/kernel/stacktrace.c | 13 ++++++------- arch/arm64/kernel/traps.c | 11 ++++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 4fad9787ab46..cfd46c227c8c 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,14 +44,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long irq_stack_ptr; /* - * Use raw_smp_processor_id() to avoid false-positives from - * CONFIG_DEBUG_PREEMPT.
get_wchan() calls unwind_frame() on sleeping - * task stacks, we can be pre-empted in this case, so - * {raw_,}smp_processor_id() may give us the wrong value. Sleeping - * tasks can't ever be on an interrupt stack, so regardless of cpu, - * the checks will always fail. + * Switching between stacks is valid when tracing current and in + * non-preemptible context. */ - irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); + if (tsk == current && !preemptible()) + irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + else + irq_stack_ptr = 0; low = frame->sp; /* irq stacks are not THREAD_SIZE aligned */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cbedd724f48e..c5392081b49b 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,9 +146,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + unsigned long irq_stack_ptr; int skip; + /* + * Switching between stacks is valid when tracing current and in + * non-preemptible context. + */ + if (tsk == current && !preemptible()) + irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + else + irq_stack_ptr = 0; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) From 46debe0769bb9597eaeb460f8dd0c3a85ad76af1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Mar 2016 15:22:55 +0000 Subject: [PATCH 0448/3220] UPSTREAM: arm64: hugetlb: partial revert of 66b3923a1a0f Commit 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") introduced support for huge pages using the contiguous bit in the PTE as opposed to block mappings, which may be slightly unwieldy (512M) in 64k page configurations. Unfortunately, this support has resulted in some late regressions when running the libhugetlbfs test suite with 64k pages and CONFIG_DEBUG_VM as a result of a BUG: | readback (2M: 64): ------------[ cut here ]------------ | kernel BUG at fs/hugetlbfs/inode.c:446! | Internal error: Oops - BUG: 0 [#1] SMP | Modules linked in: | CPU: 7 PID: 1448 Comm: readback Not tainted 4.5.0-rc7 #148 | Hardware name: linux,dummy-virt (DT) | task: fffffe0040964b00 ti: fffffe00c2668000 task.ti: fffffe00c2668000 | PC is at remove_inode_hugepages+0x44c/0x480 | LR is at remove_inode_hugepages+0x264/0x480 Rather than revert the entire patch, simply avoid advertising the contiguous huge page sizes for now while people are actively working on a fix. This patch can then be reverted once things have been sorted out. 
Cc: David Woods Reported-by: Steve Capper Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit ff7925848b50050732ac0401e0acf27e8b241d7b) Signed-off-by: Jeff Vander Stoep Change-Id: I3a9751fa79b2d2871dbdc06ea1aa3d1336bb4f4f --- arch/arm64/mm/hugetlbpage.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 82d607c3614e..da30529bb1f6 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -306,10 +306,6 @@ static __init int setup_hugepagesz(char *opt) hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else if (ps == (PAGE_SIZE * CONT_PTES)) { - hugetlb_add_hstate(CONT_PTE_SHIFT); - } else if (ps == (PMD_SIZE * CONT_PMDS)) { - hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; @@ -317,13 +313,3 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); - -#ifdef CONFIG_ARM64_64K_PAGES -static __init int add_default_hugepagesz(void) -{ - if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) - hugetlb_add_hstate(CONT_PMD_SHIFT); - return 0; -} -arch_initcall(add_default_hugepagesz); -#endif From b6d54c720c294dd973ddff2d111a4f3a0d35dc74 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 15 Mar 2016 11:22:57 +0000 Subject: [PATCH 0449/3220] UPSTREAM: arm64: fix KASLR boot-time I-cache maintenance Commit f80fb3a3d50843a4 ("arm64: add support for kernel ASLR") missed a DSB necessary to complete I-cache maintenance in the primary boot path, and hence stale instructions may still be present in the I-cache and may be executed until the I-cache maintenance naturally completes. Since commit 8ec41987436d566f ("arm64: mm: ensure patched kernel text is fetched from PoU"), all CPUs invalidate their I-caches after their MMU is enabled. Prior to a CPU's MMU having been enabled, arbitrary lines may have been fetched from the PoC into I-caches. We never patch text expected to be executed with the MMU off. Thus, it is unnecessary to perform broadcast I-cache maintenance in the primary boot path. This patch reduces the scope of the I-cache maintenance to the local CPU, and adds the missing DSB with similar scope, matching prior maintenance in the primary boot path.
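The local-scope sequence the patch adopts can be sketched as a C helper with GCC inline assembly. This is purely illustrative and assumes an AArch64 compiler; the kernel performs the sequence in head.S, not from C:

static inline void local_icache_inval_all(void)
{
	asm volatile(
	"	ic	iallu\n"	/* invalidate this CPU's I-cache only */
	"	dsb	nsh\n"		/* complete the maintenance, non-shareable scope */
	"	isb\n"			/* resynchronise the instruction stream */
	: : : "memory");
}

The pairing matters: without the dsb nsh, the ic iallu may still be in flight when execution continues, which is exactly the window the missing DSB left open.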
Signed-off-by: Mark Rutland Acked-by: Ard Biesheuvel Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit b90b4a608ea2401cc491828f7a385edd2e236e37) Signed-off-by: Jeff Vander Stoep Change-Id: Ic66b5fec29867b86782ad6c3243642afc1f40080 --- arch/arm64/kernel/head.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 319f896c6e74..a88a15447c3b 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -740,8 +740,9 @@ __enable_mmu: msr sctlr_el1, x19 // re-enable the MMU isb - ic ialluis // flush instructions fetched - isb // via old mapping + ic iallu // flush instructions fetched + dsb nsh // via old mapping + isb add x27, x27, x23 // relocated __mmap_switched #endif br x27 From 25fd795692e05141f36099f778c054a86d586812 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 22 Mar 2016 10:11:45 +0000 Subject: [PATCH 0450/3220] UPSTREAM: arm64: consistently use p?d_set_huge Commit 324420bf91f60582 ("arm64: add support for ioremap() block mappings") added new p?d_set_huge functions which do the hard work to generate and set a correct block entry. These differ from open-coded huge page creation in the early page table code by explicitly setting the P?D_TYPE_SECT bits (which are implicitly retained by mk_sect_prot() for any valid prot), but are otherwise identical (and cannot fail on arm64). For simplicity and consistency, make use of these in the initial page table creation code. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit c661cb1c537e2364bfdabb298fb934fd77445e98) Signed-off-by: Jeff Vander Stoep Change-Id: I25e58a1626831c2c709abcded989d1770fea851c --- arch/arm64/mm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f4d1da0c1531..72e634533a3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -211,8 +211,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, if (((addr | next | phys) & ~SECTION_MASK) == 0 && block_mappings_allowed(pgtable_alloc)) { pmd_t old_pmd =*pmd; - set_pmd(pmd, __pmd(phys | - pgprot_val(mk_sect_prot(prot)))); + pmd_set_huge(pmd, phys, prot); /* * Check for previous table entries created during * boot (__create_page_tables) and flush them. @@ -272,8 +271,7 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, if (use_1G_block(addr, next, phys) && block_mappings_allowed(pgtable_alloc)) { pud_t old_pud = *pud; - set_pud(pud, __pud(phys | - pgprot_val(mk_sect_prot(prot)))); + pud_set_huge(pud, phys, prot); /* * If we have an old value for a pud, it will From 1c4143d40cec48cac343bf1e8a426746ac240f92 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 10 Mar 2016 18:41:16 +0000 Subject: [PATCH 0451/3220] UPSTREAM: arm64: kasan: Fix zero shadow mapping overriding kernel image shadow With the 16KB and 64KB page size configurations, SWAPPER_BLOCK_SIZE is PAGE_SIZE and ARM64_SWAPPER_USES_SECTION_MAPS is 0.
Since kimg_shadow_end is not page aligned (_end shifted by KASAN_SHADOW_SCALE_SHIFT), the edges of previously mapped kernel image shadow via vmemmap_populate() may be overridden by subsequent calls to kasan_populate_zero_shadow(), leading to kernel panics like below: ------------------------------------------------------------------------------ Unable to handle kernel paging request at virtual address fffffc100135068c pgd = fffffc8009ac0000 [fffffc100135068c] *pgd=00000009ffee0003, *pud=00000009ffee0003, *pmd=00000009ffee0003, *pte=00e0000081a00793 Internal error: Oops: 9600004f [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.5.0-rc4+ #1984 Hardware name: Juno (DT) task: fffffe09001a0000 ti: fffffe0900200000 task.ti: fffffe0900200000 PC is at __memset+0x4c/0x200 LR is at kasan_unpoison_shadow+0x34/0x50 pc : [] lr : [] pstate: 00000245 sp : fffffe0900203db0 x29: fffffe0900203db0 x28: 0000000000000000 x27: 0000000000000000 x26: 0000000000000000 x25: fffffc80099b69d0 x24: 0000000000000001 x23: 0000000000000000 x22: 0000000000002000 x21: dffffc8000000000 x20: 1fffff9001350a8c x19: 0000000000002000 x18: 0000000000000008 x17: 0000000000000147 x16: ffffffffffffffff x15: 79746972100e041d x14: ffffff0000000000 x13: ffff000000000000 x12: 0000000000000000 x11: 0101010101010101 x10: 1fffffc11c000000 x9 : 0000000000000000 x8 : fffffc100135068c x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : 0000000000000004 x3 : fffffc100134f651 x2 : 0000000000000400 x1 : 0000000000000000 x0 : fffffc100135068c Process swapper/0 (pid: 1, stack limit = 0xfffffe0900200020) Call trace: [] __memset+0x4c/0x200 [] __asan_register_globals+0x5c/0xb0 [] _GLOBAL__sub_I_65535_1_sunrpc_cache_lookup+0x1c/0x28 [] kernel_init_freeable+0x104/0x274 [] kernel_init+0x10/0xf8 [] ret_from_fork+0x10/0x50 ------------------------------------------------------------------------------ This patch aligns kimg_shadow_start and kimg_shadow_end to SWAPPER_BLOCK_SIZE in all configurations. Fixes: f9040773b7bb ("arm64: move kernel image to base of vmalloc area") Signed-off-by: Catalin Marinas Acked-by: Mark Rutland Acked-by: Ard Biesheuvel Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2776e0e8ef683a42fe3e9a5facf576b73579700e) Signed-off-by: Jeff Vander Stoep Change-Id: I13a6b38aefbeddd20bc87cb1382f2787bbc5cf9c --- arch/arm64/mm/kasan_init.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 56e19d150c21..206dd95ea292 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -158,15 +158,12 @@ void __init kasan_init(void) * vmemmap_populate() has populated the shadow region that covers the * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent - * kasan_populate_zero_shadow() from replacing the PMD block mappings - * with PMD table mappings at the edges of the shadow region for the - * kernel image. + * kasan_populate_zero_shadow() from replacing the page table entries + * (PMD or PTE) at the edges of the shadow region for the kernel + * image. 
*/ - if (ARM64_SWAPPER_USES_SECTION_MAPS) { - kimg_shadow_start = round_down(kimg_shadow_start, - SWAPPER_BLOCK_SIZE); - kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); - } + kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE); + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, (void *)mod_shadow_start); From f41890295d6496ed50bbe8606d23fb10661a844f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 2 Mar 2016 09:47:13 +0100 Subject: [PATCH 0452/3220] UPSTREAM: arm64: mm: check at build time that PAGE_OFFSET divides the VA space evenly Commit 8439e62a1561 ("arm64: mm: use bit ops rather than arithmetic in pa/va translations") changed the boundary check against PAGE_OFFSET from an arithmetic comparison to a bit test. This means we now silently assume that PAGE_OFFSET is a power of 2 that divides the kernel virtual address space into two equal halves. So make that assumption explicit. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 6d2aa549de1fc998581d216de3853aa131aa4446) Signed-off-by: Jeff Vander Stoep Change-Id: I8c3bc8cdb7d7f7dea092fd1a208b04583a141054 --- arch/arm64/mm/init.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 8e3cf19e4f00..2c38be3df4c8 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -172,6 +172,13 @@ void __init arm64_memblock_init(void) { const s64 linear_region_size = -(s64)PAGE_OFFSET; + /* + * Ensure that the linear region takes up exactly half of the kernel + * virtual address space. This way, we can distinguish a linear address + * from a kernel/module/vmalloc address by testing a single bit. + */ + BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1)); + /* * Select a suitable value for the base of physical memory. */ From 391ba6bf3e866b9deb4b0f1d2f52c0ede0acce5a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 25 Feb 2016 20:48:53 +0100 Subject: [PATCH 0453/3220] UPSTREAM: arm64: lse: deal with clobbered IP registers after branch via PLT The LSE atomics implementation uses runtime patching to patch in calls to out of line non-LSE atomics implementations on cores that lack hardware support for LSE. To avoid paying the overhead cost of a function call even if no call ends up being made, the bl instruction is kept invisible to the compiler, and the out of line implementations preserve all registers, not just the ones that they are required to preserve as per the AAPCS64. However, commit fd045f6cd98e ("arm64: add support for module PLTs") added support for routing branch instructions via veneers if the branch target offset exceeds the range of the ordinary relative branch instructions. Since this deals with jump and call instructions that are exposed to ELF relocations, the PLT code uses x16 to hold the address of the branch target when it performs an indirect branch-to-register, something which is explicitly allowed by the AAPCS64 (and ordinary compiler generated code does not expect register x16 or x17 to retain their values across a bl instruction). Since the lse runtime patched bl instructions don't adhere to the AAPCS64, they don't deal with this clobbering of registers x16 and x17. So add them to the clobber list of the asm() statements that perform the call instructions, and drop x16 and x17 from the list of registers that are callee saved in the out of line non-LSE implementations. 
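A hedged sketch of the resulting asm() shape, heavily simplified from the real ATOMIC_OP macros (the callee symbol here just follows the __LL_SC_PREFIX naming convention, and the example assumes an AArch64 toolchain):

static inline void atomic_add_sketch(int i, int *counter)
{
	register int w0 asm("w0") = i;
	register int *x1 asm("x1") = counter;

	asm volatile(
	"	bl	__ll_sc_atomic_add\n"	/* may be routed via a PLT veneer */
	: "+r" (w0), "+Q" (*counter)
	: "r" (x1)
	: "x16", "x17", "x30");		/* i.e. the __LL_SC_CLOBBERS set */
}

Naming x16, x17 and x30 as clobbers keeps the compiler from carrying live values in the registers that a veneer, or the bl itself, may overwrite.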
In addition, since we have given these functions two scratch registers, they no longer need to stack/unstack temp registers. Signed-off-by: Ard Biesheuvel [will: factored clobber list into #define, updated Makefile comment] Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 5be8b70af1ca78cefb8b756d157532360a5fd663) Signed-off-by: Jeff Vander Stoep Change-Id: Ia44a54eba315a47a6b8aaa2259b444e0139162c0 --- arch/arm64/include/asm/atomic_lse.h | 38 ++++++++++++++--------------- arch/arm64/include/asm/lse.h | 1 + arch/arm64/lib/Makefile | 13 +++++----- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index 197e06afbf71..39c1d340fec5 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -36,7 +36,7 @@ static inline void atomic_andnot(int i, atomic_t *v) " stclr %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_or(int i, atomic_t *v) @@ -48,7 +48,7 @@ static inline void atomic_or(int i, atomic_t *v) " stset %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_xor(int i, atomic_t *v) @@ -60,7 +60,7 @@ static inline void atomic_xor(int i, atomic_t *v) " steor %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_add(int i, atomic_t *v) @@ -72,7 +72,7 @@ static inline void atomic_add(int i, atomic_t *v) " stadd %w[i], %[v]\n") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC_OP_ADD_RETURN(name, mb, cl...) \ @@ -90,7 +90,7 @@ static inline int atomic_add_return##name(int i, atomic_t *v) \ " add %w[i], %w[i], w30") \ : [i] "+r" (w0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return w0; \ } @@ -116,7 +116,7 @@ static inline void atomic_and(int i, atomic_t *v) " stclr %w[i], %[v]") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic_sub(int i, atomic_t *v) @@ -133,7 +133,7 @@ static inline void atomic_sub(int i, atomic_t *v) " stadd %w[i], %[v]") : [i] "+r" (w0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC_OP_SUB_RETURN(name, mb, cl...) 
\ @@ -153,7 +153,7 @@ static inline int atomic_sub_return##name(int i, atomic_t *v) \ " add %w[i], %w[i], w30") \ : [i] "+r" (w0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS , ##cl); \ \ return w0; \ } @@ -177,7 +177,7 @@ static inline void atomic64_andnot(long i, atomic64_t *v) " stclr %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_or(long i, atomic64_t *v) @@ -189,7 +189,7 @@ static inline void atomic64_or(long i, atomic64_t *v) " stset %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_xor(long i, atomic64_t *v) @@ -201,7 +201,7 @@ static inline void atomic64_xor(long i, atomic64_t *v) " steor %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_add(long i, atomic64_t *v) @@ -213,7 +213,7 @@ static inline void atomic64_add(long i, atomic64_t *v) " stadd %[i], %[v]\n") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...) \ @@ -231,7 +231,7 @@ static inline long atomic64_add_return##name(long i, atomic64_t *v) \ " add %[i], %[i], x30") \ : [i] "+r" (x0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -257,7 +257,7 @@ static inline void atomic64_and(long i, atomic64_t *v) " stclr %[i], %[v]") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } static inline void atomic64_sub(long i, atomic64_t *v) @@ -274,7 +274,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) " stadd %[i], %[v]") : [i] "+r" (x0), [v] "+Q" (v->counter) : "r" (x1) - : "x30"); + : __LL_SC_CLOBBERS); } #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...) 
\ @@ -294,7 +294,7 @@ static inline long atomic64_sub_return##name(long i, atomic64_t *v) \ " add %[i], %[i], x30") \ : [i] "+r" (x0), [v] "+Q" (v->counter) \ : "r" (x1) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -330,7 +330,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) "2:") : [ret] "+&r" (x0), [v] "+Q" (v->counter) : - : "x30", "cc", "memory"); + : __LL_SC_CLOBBERS, "cc", "memory"); return x0; } @@ -359,7 +359,7 @@ static inline unsigned long __cmpxchg_case_##name(volatile void *ptr, \ " mov %" #w "[ret], " #w "30") \ : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr) \ : [old] "r" (x1), [new] "r" (x2) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } @@ -416,7 +416,7 @@ static inline long __cmpxchg_double##name(unsigned long old1, \ [v] "+Q" (*(unsigned long *)ptr) \ : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ [oldval1] "r" (oldval1), [oldval2] "r" (oldval2) \ - : "x30" , ##cl); \ + : __LL_SC_CLOBBERS, ##cl); \ \ return x0; \ } diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h index 3de42d68611d..23acc00be32d 100644 --- a/arch/arm64/include/asm/lse.h +++ b/arch/arm64/include/asm/lse.h @@ -26,6 +26,7 @@ __asm__(".arch_extension lse"); /* Macro for constructing calls to out-of-line ll/sc atomics */ #define __LL_SC_CALL(op) "bl\t" __stringify(__LL_SC_PREFIX(op)) "\n" +#define __LL_SC_CLOBBERS "x16", "x17", "x30" /* In-line patching at runtime */ #define ARM64_LSE_ATOMIC_INSN(llsc, lse) \ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 1a811ecf71da..c86b7909ef31 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -4,15 +4,16 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ strchr.o strrchr.o -# Tell the compiler to treat all general purpose registers as -# callee-saved, which allows for efficient runtime patching of the bl -# instruction in the caller with an atomic instruction when supported by -# the CPU. Result and argument registers are handled correctly, based on -# the function prototype. +# Tell the compiler to treat all general purpose registers (with the +# exception of the IP registers, which are already handled by the caller +# in case of a PLT) as callee-saved, which allows for efficient runtime +# patching of the bl instruction in the caller with an atomic instruction +# when supported by the CPU. Result and argument registers are handled +# correctly, based on the function prototype. lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o CFLAGS_atomic_ll_sc.o := -fcall-used-x0 -ffixed-x1 -ffixed-x2 \ -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 \ -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9 \ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ - -fcall-saved-x16 -fcall-saved-x17 -fcall-saved-x18 + -fcall-saved-x18 From 1d2e2e3c0f8f343e4e74cdd89fb94fc341578e64 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 10 Mar 2016 18:30:56 +0000 Subject: [PATCH 0454/3220] UPSTREAM: arm64: kasan: Use actual memory node when populating the kernel image shadow With the 16KB or 64KB page configurations, the generic vmemmap_populate() implementation warns on potential offnode page_structs via vmemmap_verify() because the arm64 kasan_init() passes NUMA_NO_NODE instead of the actual node for the kernel image memory. 
Fixes: f9040773b7bb ("arm64: move kernel image to base of vmalloc area") Signed-off-by: Catalin Marinas Reported-by: James Morse Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 2f76969f2eef051bdd63d38b08d78e790440b0ad) Signed-off-by: Jeff Vander Stoep Change-Id: I8985e5b4628a9c7076767d4565f7633635813b5c --- arch/arm64/mm/kasan_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 206dd95ea292..757009daa9ed 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -152,7 +152,8 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); - vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE); + vmemmap_populate(kimg_shadow_start, kimg_shadow_end, + pfn_to_nid(virt_to_pfn(_text))); /* * vmemmap_populate() has populated the shadow region that covers the From 42f010ac16ad91ca029ecb77dbf18ca95d2b9598 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 26 Jul 2016 10:16:55 -0700 Subject: [PATCH 0455/3220] UPSTREAM: arm64: Only select ARM64_MODULE_PLTS if MODULES=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting CONFIG_RANDOMIZE_BASE=y and CONFIG_MODULES=n fails to build the module PLTs support: CC arch/arm64/kernel/module-plts.o /work/Linux/linux-2.6-aarch64/arch/arm64/kernel/module-plts.c: In function ‘module_emit_plt_entry’: /work/Linux/linux-2.6-aarch64/arch/arm64/kernel/module-plts.c:32:49: error: dereferencing pointer to incomplete type ‘struct module’ This patch selects ARM64_MODULE_PLTS conditionally only if MODULES is enabled. Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Cc: # 4.6+ Reported-by: Jeff Vander Stoep Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit b9c220b589daaf140f5b8ebe502c98745b94e65c) Signed-off-by: Jeff Vander Stoep Change-Id: I446cb3aa78f1c64b5aa1e2e90fda13f7d46cac33 --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0b5ec89a8ec4..bb92d3875791 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -784,7 +784,7 @@ config RELOCATABLE config RANDOMIZE_BASE bool "Randomize the address of the kernel image" - select ARM64_MODULE_PLTS + select ARM64_MODULE_PLTS if MODULES select RELOCATABLE help Randomizes the virtual address at which the kernel image is From 1488d59cb5fd80dea0abb78bafb5130b64f561db Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 24 Aug 2016 18:02:08 +0100 Subject: [PATCH 0456/3220] UPSTREAM: arm64: avoid TLB conflict with CONFIG_RANDOMIZE_BASE When CONFIG_RANDOMIZE_BASE is selected, we modify the page tables to remap the kernel at a newly-chosen VA range. We do this with the MMU disabled, but do not invalidate TLBs prior to re-enabling the MMU with the new tables. Thus the old mapping entries may still live in TLBs, and we risk violating Break-Before-Make requirements, leading to TLB conflicts and/or other issues. We invalidate TLBs when we uninstall the idmap in early setup code, but prior to this we are subject to issues relating to the Break-Before-Make violation. Avoid these issues by invalidating the TLBs before the new mappings can be used by the hardware.
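The ordering the patch enforces can be sketched as follows; this is an illustration only, since the kernel issues the sequence from head.S assembly while the MMU is still off:

static inline void enable_mmu_with_new_tables(unsigned long sctlr)
{
	asm volatile(
	"	tlbi	vmalle1\n"	/* remove stale EL1 entries from the old mapping */
	"	dsb	nsh\n"		/* complete the invalidation... */
	"	msr	sctlr_el1, %0\n"	/* ...before the MMU can walk the new tables */
	"	isb\n"
	: : "r" (sctlr) : "memory");
}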
Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Cc: # 4.6+ Acked-by: Ard Biesheuvel Acked-by: Will Deacon Signed-off-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit fd363bd417ddb6103564c69cfcbd92d9a7877431) Signed-off-by: Jeff Vander Stoep Change-Id: I6c23ce55cdd8b66587b6787b8f28df8535e39f24 --- arch/arm64/kernel/head.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a88a15447c3b..54fd1e2590a2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -738,6 +738,9 @@ __enable_mmu: isb bl __create_page_tables // recreate kernel mapping + tlbi vmalle1 // Remove any stale TLB entries + dsb nsh + msr sctlr_el1, x19 // re-enable the MMU isb ic iallu // flush instructions fetched From 542ecd89801bcab7306825c0fbc6231cd67e170f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 8 Jun 2016 15:10:57 +0100 Subject: [PATCH 0457/3220] UPSTREAM: arm64: spinlock: fix spin_unlock_wait for LSE atomics Commit d86b8da04dfa ("arm64: spinlock: serialise spin_unlock_wait against concurrent lockers") fixed spin_unlock_wait for LL/SC-based atomics under the premise that the LSE atomics (in particular, the LDADDA instruction) are indivisible. Unfortunately, these instructions are only indivisible when used with the -AL (full ordering) suffix and, consequently, the same issue can theoretically be observed with LSE atomics, where a later (in program order) load can be speculated before the write portion of the atomic operation. This patch fixes the issue by performing a CAS of the lock once we've established that it's unlocked, in much the same way as the LL/SC code. Fixes: d86b8da04dfa ("arm64: spinlock: serialise spin_unlock_wait against concurrent lockers") Signed-off-by: Will Deacon Bug: 30369029 Patchset: kaslr-arm64-4.4 (cherry picked from commit 3a5facd09da848193f5bcb0dea098a298bc1a29d) Signed-off-by: Jeff Vander Stoep Change-Id: Icedaa4c508784bf43d0b5787586480fd668ccc49 --- arch/arm64/include/asm/spinlock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index fc9682bfe002..deb6e5ac891f 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -37,13 +37,17 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) "2: ldaxr %w0, %2\n" " eor %w1, %w0, %w0, ror #16\n" " cbnz %w1, 1b\n" + /* Serialise against any concurrent lockers */ ARM64_LSE_ATOMIC_INSN( /* LL/SC */ " stxr %w1, %w0, %2\n" -" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */ - /* LSE atomics */ " nop\n" -" nop\n") +" nop\n", + /* LSE atomics */ +" mov %w1, %w0\n" +" cas %w0, %w0, %2\n" +" eor %w1, %w1, %w0\n") +" cbnz %w1, 2b\n" : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) : : "memory"); From 8ab85055db1054611473cfd1d72512e45b37e251 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:12 -0800 Subject: [PATCH 0458/3220] UPSTREAM: asm-generic: Consolidate mark_rodata_ro() Instead of defining mark_rodata_ro() in each architecture, consolidate it. Signed-off-by: Kees Cook Acked-by: Will Deacon Cc: Andrew Morton Cc: Andy Gross Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Ashok Kumar Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Dan Williams Cc: David Brown Cc: David Hildenbrand Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Helge Deller Cc: James E.J. 
Bottomley Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Marc Zyngier Cc: Mark Rutland Cc: Mathias Krause Cc: Michael Ellerman Cc: Nicolas Pitre Cc: PaX Team Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Russell King Cc: Rusty Russell Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: Iec0c44b3f5d7948954da93fba6cb57888a2709de (cherry picked from commit e267d97b83d9cecc16c54825f9f3ac7f72dc1e1e) Signed-off-by: Sami Tolvanen --- arch/arm/include/asm/cacheflush.h | 1 - arch/arm64/include/asm/cacheflush.h | 4 ---- arch/parisc/include/asm/cacheflush.h | 4 ---- arch/x86/include/asm/cacheflush.h | 1 - include/linux/init.h | 4 ++++ 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index d5525bfc7e3e..9156fc303afd 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); void set_kernel_text_rw(void); void set_kernel_text_ro(void); #else diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 7fc294c3bc5b..22dda613f9c9 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -156,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); -#endif - #endif diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 845272ce9cc5..7bd69bd43a01 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } } -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); -#endif - #include #define ARCH_HAS_KMAP diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e63aa38e85fb..c8cff75c5b21 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -92,7 +92,6 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); diff --git a/include/linux/init.h b/include/linux/init.h index b449f378f995..aedb254abc37 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -142,6 +142,10 @@ void prepare_namespace(void); void __init load_default_modules(void); int __init init_rootfs(void); +#ifdef CONFIG_DEBUG_RODATA +void mark_rodata_ro(void); +#endif + extern void (*late_time_init)(void); extern bool initcall_debug; From 36e8afda0dc5ac915ad9a6742e5a8708ed656249 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:13 -0800 Subject: [PATCH 0459/3220] UPSTREAM: mm/init: Add 'rodata=off' boot cmdline parameter to disable read-only kernel mappings It may be useful to debug writes to the readonly sections of memory, so provide a cmdline 
"rodata=off" to allow for this. This can be expanded in the future to support "log" and "write" modes, but that will need to be architecture-specific. This also makes KDB software breakpoints more usable, as read-only mappings can now be disabled on any kernel. Suggested-by: H. Peter Anvin Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I67b818ca390afdd42ab1c27cb4f8ac64bbdb3b65 (cherry picked from commit d2aa1acad22f1bdd0cfa67b3861800e392254454) Signed-off-by: Sami Tolvanen --- Documentation/kernel-parameters.txt | 4 ++++ init/main.c | 27 +++++++++++++++++++++++---- kernel/debug/kdb/kdb_bp.c | 4 +--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b02f027cba72..6bb9f95e7690 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3415,6 +3415,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ro [KNL] Mount root device read-only on boot + rodata= [KNL] + on Mark read-only kernel memory as read-only (default). + off Leave read-only kernel memory writable for debugging. + root= [KNL] Root filesystem See name_to_dev_t comment in init/do_mounts.c. diff --git a/init/main.c b/init/main.c index 9e64d7097f1a..fbafa271531c 100644 --- a/init/main.c +++ b/init/main.c @@ -93,9 +93,6 @@ static int kernel_init(void *); extern void init_IRQ(void); extern void fork_init(void); extern void radix_tree_init(void); -#ifndef CONFIG_DEBUG_RODATA -static inline void mark_rodata_ro(void) { } -#endif /* * Debug helper: via this flag we know that we are in 'early bootup code' @@ -929,6 +926,28 @@ static int try_to_run_init_process(const char *init_filename) static noinline void __init kernel_init_freeable(void); +#ifdef CONFIG_DEBUG_RODATA +static bool rodata_enabled = true; +static int __init set_debug_rodata(char *str) +{ + return strtobool(str, &rodata_enabled); +} +__setup("rodata=", set_debug_rodata); + +static void mark_readonly(void) +{ + if (rodata_enabled) + mark_rodata_ro(); + else + pr_info("Kernel memory protection disabled.\n"); +} +#else +static inline void mark_readonly(void) +{ + pr_warn("This architecture does not have kernel memory protection.\n"); +} +#endif + static int __ref kernel_init(void *unused) { int ret; @@ -937,7 +956,7 @@ static int __ref kernel_init(void *unused) /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); - mark_rodata_ro(); + mark_readonly(); system_state = SYSTEM_RUNNING; numa_default_policy(); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index e1dbf4a2c69e..90ff129c88a2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); -#ifdef CONFIG_DEBUG_RODATA if (!bp->bp_type) { kdb_printf("Software breakpoints are unavailable.\n" - " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " Boot the kernel with rodata=off\n" " OR use hw breaks: help bph\n"); } -#endif 
return 1; } return 0; From fa88d4cf695b9c3a0de87afaa32d59a3913cfd52 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:14 -0800 Subject: [PATCH 0460/3220] UPSTREAM: x86/mm: Always enable CONFIG_DEBUG_RODATA and remove the Kconfig option This removes the CONFIG_DEBUG_RODATA option and makes it always enabled. This simplifies the code and also makes it clearer that read-only mapped memory is just as fundamental a security feature in kernel-space as it is in user-space. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I3e79c7c4ead79a81c1445f1b3dd28003517faf18 (cherry picked from commit 9ccaf77cf05915f51231d158abfd5448aedde758) Signed-off-by: Sami Tolvanen --- arch/x86/Kconfig | 3 +++ arch/x86/Kconfig.debug | 18 +++--------------- arch/x86/include/asm/cacheflush.h | 5 ----- arch/x86/include/asm/kvm_para.h | 7 ------- arch/x86/include/asm/sections.h | 2 +- arch/x86/kernel/ftrace.c | 6 +++--- arch/x86/kernel/kgdb.c | 8 ++------ arch/x86/kernel/test_nx.c | 2 -- arch/x86/kernel/test_rodata.c | 2 +- arch/x86/kernel/vmlinux.lds.S | 25 +++++++++++-------------- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/init_64.c | 3 --- arch/x86/mm/pageattr.c | 2 +- 13 files changed, 25 insertions(+), 61 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9e3b1e57499a..7fa0a1830ef6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -307,6 +307,9 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config DEBUG_RODATA + def_bool y + config PGTABLE_LEVELS int default 4 if X86_64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 137dfa96aa14..1f6c306a9a00 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -91,28 +91,16 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_RODATA - bool "Write protect kernel read-only data structures" - default y - depends on DEBUG_KERNEL - ---help--- - Mark the kernel read-only data as write-protected in the pagetables, - in order to catch accidental (and incorrect) writes to such const - data. This is recommended so that we can catch kernel bugs sooner. - If in doubt, say "Y". - config DEBUG_RODATA_TEST - bool "Testcase for the DEBUG_RODATA feature" - depends on DEBUG_RODATA + bool "Testcase for the marking rodata read-only" default y ---help--- - This option enables a testcase for the DEBUG_RODATA - feature as well as for the change_page_attr() infrastructure. + This option enables a testcase for the setting rodata read-only + as well as for the change_page_attr() infrastructure. If in doubt, say "N" config DEBUG_WX bool "Warn on W+X mappings at boot" - depends on DEBUG_RODATA select X86_PTDUMP_CORE ---help--- Generate a warning if any W+X mappings are found at boot. 
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index c8cff75c5b21..61518cf79437 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -91,15 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -#ifdef CONFIG_DEBUG_RODATA extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#else -static inline void set_kernel_text_rw(void) { } -static inline void set_kernel_text_ro(void) { } -#endif #ifdef CONFIG_DEBUG_RODATA_TEST int rodata_test(void); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c1adf33fdd0d..bc62e7cbf1b1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -#ifdef CONFIG_DEBUG_RODATA #define KVM_HYPERCALL \ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) -#else -/* On AMD processors, vmcall will generate a trap that we will - * then rewrite to the appropriate instruction. - */ -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 0a5242428659..13b6cdd0af57 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,7 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 311bcf338f07..eb6bd34582c6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end) static unsigned long text_ip_addr(unsigned long ip) { /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. + * On x86_64, kernel text mappings are mapped read-only, so we use + * the kernel identity mapping instead of the kernel text mapping + * to modify the kernel text. * * For 32bit kernels, these mappings are same and we can use * kernel identity mapping to modify code. 
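The ftrace comment above reduces to a single address translation; a sketch of what it describes, with an invented helper name (x86_64 assumed; text_ip_addr() performs exactly this conversion):

	/*
	 * Convert a kernel-text virtual address into its alias in the
	 * direct (identity) mapping, which remains writable even after
	 * the text mapping has been made read-only.
	 */
	static unsigned long text_to_identity_addr(unsigned long ip)
	{
		return (unsigned long)__va(__pa_symbol(ip));
	}
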
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 44256a62702b..ed15cd486d06 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; -#ifdef CONFIG_DEBUG_RODATA char opc[BREAK_INSTR_SIZE]; -#endif /* CONFIG_DEBUG_RODATA */ bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) return err; err = probe_kernel_write((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); -#ifdef CONFIG_DEBUG_RODATA if (!err) return err; /* @@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) return -EINVAL; bpt->type = BP_POKE_BREAKPOINT; -#endif /* CONFIG_DEBUG_RODATA */ + return err; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { -#ifdef CONFIG_DEBUG_RODATA int err; char opc[BREAK_INSTR_SIZE]; @@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) goto knl_write; return err; + knl_write: -#endif /* CONFIG_DEBUG_RODATA */ return probe_kernel_write((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 3f92ce07e525..27538f183c3b 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -142,7 +142,6 @@ static int test_NX(void) * by the error message */ -#ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ if (rodata_test_data != 0xC3) { printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); @@ -151,7 +150,6 @@ static int test_NX(void) printk(KERN_ERR "test_nx: .rodata section is executable\n"); ret = -ENODEV; } -#endif #if 0 /* Test 4: Check if the .data section of a module is executable */ diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 5ecbfe5099da..cb4a01b41e27 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -76,5 +76,5 @@ int rodata_test(void) } MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); MODULE_AUTHOR("Arjan van de Ven "); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..fe133b710bef 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,29 +41,28 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* - * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA - * we retain large page mappings for boundaries spanning kernel text, rodata - * and data sections. + * On 64-bit, align RODATA to 2MB so we retain large page mappings for + * boundaries spanning kernel text, rodata and data sections. * * However, kernel identity mappings will have different RWX permissions * to the pages mapping to text and to the pages padding (which are freed) the * text section. Hence kernel identity mappings will be broken to smaller * pages. For 64-bit, kernel text and kernel identity mappings are different, - * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, - * as well as retain 2MB large page mappings for kernel text. 
+ * so we can enable protection checks as well as retain 2MB large page + * mappings for kernel text. */ -#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_DEBUG_RODATA_END \ +#define X64_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; #else -#define X64_ALIGN_DEBUG_RODATA_BEGIN -#define X64_ALIGN_DEBUG_RODATA_END +#define X64_ALIGN_RODATA_BEGIN +#define X64_ALIGN_RODATA_END #endif @@ -112,13 +111,11 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 -#if defined(CONFIG_DEBUG_RODATA) /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); -#endif - X64_ALIGN_DEBUG_RODATA_BEGIN + X64_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_DEBUG_RODATA_END + X64_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3de61f9..2ebfbaf61142 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void) return flag; } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -960,5 +959,3 @@ void mark_rodata_ro(void) if (__supported_pte_mask & _PAGE_NX) debug_checkwx(); } -#endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec081fe0ce2c..e08d141844ee 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1062,7 +1062,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -1154,8 +1153,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3137a4feed1..8d77c7f7a614 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -278,7 +278,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections From bfd3de058f94daa2a2751e7660bb9561fa23d7cd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:15 -0800 Subject: [PATCH 0461/3220] UPSTREAM: arch: Introduce post-init read-only memory One of the easiest ways to protect the kernel from attack is to reduce the internal attack surface exposed when a "write" flaw is available. By making as much of the kernel read-only as possible, we reduce the attack surface. Many things are written to only during __init, and never changed again. These cannot be made "const" since the compiler will do the wrong thing (we do actually need to write to them). Instead, move these items into a memory region that will be made read-only during mark_rodata_ro() which happens after all kernel __init code has finished. This introduces __ro_after_init as a way to mark such memory, and adds some documentation about the existing __read_mostly marking. This improves the security of the Linux kernel by marking formerly read-write memory regions as read-only on a fully booted up system. Based on work by PaX Team and Brad Spengler. 
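To make the new annotation concrete, here is a minimal usage sketch (the variable and initcall are hypothetical): the value is writable while __init code runs and becomes read-only once mark_rodata_ro() has been called:

	#include <linux/cache.h>
	#include <linux/init.h>
	#include <linux/random.h>

	static unsigned long boot_seed __ro_after_init;

	static int __init boot_seed_init(void)
	{
		/* legal: all initcalls run before mark_rodata_ro() */
		get_random_bytes(&boot_seed, sizeof(boot_seed));
		return 0;
	}
	early_initcall(boot_seed_init);

Any write to boot_seed after init then faults, just like a write to .rodata.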
Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I640f6d858d9770a5e480d12a1c716adf8842feb0 (cherry picked from commit c74ba8b3480da6ddaea17df2263ec09b869ac496) Signed-off-by: Sami Tolvanen --- arch/parisc/include/asm/cache.h | 3 +++ include/asm-generic/vmlinux.lds.h | 1 + include/linux/cache.h | 14 ++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index 3d0e17bcc8e9..df0f52bd18b4 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -22,6 +22,9 @@ #define __read_mostly __attribute__((__section__(".data..read_mostly"))) +/* Read-only memory is marked before mark_rodata_ro() is called. */ +#define __ro_after_init __read_mostly + void parisc_cache_init(void); /* initializes cache-flushing */ void disable_sr_hashing_asm(int); /* low level support for above */ void disable_sr_hashing(void); /* turns off space register hashing */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c4bd0e2c173c..772c784ba763 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -256,6 +256,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ + *(.data..ro_after_init) /* Read only after init */ \ *(__vermagic) /* Kernel version magic */ \ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \ diff --git a/include/linux/cache.h b/include/linux/cache.h index 17e7e82d2aa7..1be04f8c563a 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -12,10 +12,24 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES #endif +/* + * __read_mostly is used to keep rarely changing variables out of frequently + * updated cachelines. If an architecture doesn't support it, ignore the + * hint. + */ #ifndef __read_mostly #define __read_mostly #endif +/* + * __ro_after_init is used to mark things that are read-only after init (i.e. + * after mark_rodata_ro() has been called). These are effectively read-only, + * but may get written to during init, so can't live in .rodata (via "const"). + */ +#ifndef __ro_after_init +#define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) +#endif + #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #endif From ba24621d7a79e3335f478fead5a84d6ee1a212ce Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:16 -0800 Subject: [PATCH 0462/3220] UPSTREAM: lkdtm: Verify that '__ro_after_init' works correctly The new __ro_after_init section should be writable before init, but not after. Validate that it gets updated at init and can't be written to afterwards. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I75301d0497fde49a02f13b4e75300111ddadda9d (cherry picked from commit 7cca071ccbd2a293ea69168ace6abbcdce53098e) Signed-off-by: Sami Tolvanen --- drivers/misc/lkdtm.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index 11fdadc68e53..2a6eaf1122b4 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -103,6 +103,7 @@ enum ctype { CT_EXEC_USERSPACE, CT_ACCESS_USERSPACE, CT_WRITE_RO, + CT_WRITE_RO_AFTER_INIT, CT_WRITE_KERN, }; @@ -140,6 +141,7 @@ static char* cp_type[] = { "EXEC_USERSPACE", "ACCESS_USERSPACE", "WRITE_RO", + "WRITE_RO_AFTER_INIT", "WRITE_KERN", }; @@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up); static u8 data_area[EXEC_SIZE]; static const unsigned long rodata = 0xAA55AA55; +static unsigned long ro_after_init __ro_after_init = 0x55AA5500; module_param(recur_count, int, 0644); MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test"); @@ -503,11 +506,28 @@ static void lkdtm_do_action(enum ctype which) break; } case CT_WRITE_RO: { - unsigned long *ptr; + /* Explicitly cast away "const" for the test. */ + unsigned long *ptr = (unsigned long *)&rodata; - ptr = (unsigned long *)&rodata; + pr_info("attempting bad rodata write at %p\n", ptr); + *ptr ^= 0xabcd1234; - pr_info("attempting bad write at %p\n", ptr); + break; + } + case CT_WRITE_RO_AFTER_INIT: { + unsigned long *ptr = &ro_after_init; + + /* + * Verify we were written to during init. Since an Oops + * is considered a "success", a failure is to just skip the + * real test. + */ + if ((*ptr & 0xAA) != 0xAA) { + pr_info("%p was NOT written during init!?\n", ptr); + break; + } + + pr_info("attempting bad ro_after_init write at %p\n", ptr); *ptr ^= 0xabcd1234; break; @@ -817,6 +837,9 @@ static int __init lkdtm_module_init(void) int n_debugfs_entries = 1; /* Assume only the direct entry */ int i; + /* Make sure we can write to __ro_after_init values during __init */ + ro_after_init |= 0xAA; + /* Register debugfs interface */ lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL); if (!lkdtm_debugfs_root) { From 60e8ff20653571ccab43f9f4554baaa26436e34b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:17 -0800 Subject: [PATCH 0463/3220] UPSTREAM: x86/vdso: Mark the vDSO code read-only after init The vDSO does not need to be writable after __init, so mark it as __ro_after_init. 
The result kills the exploit method of writing to the vDSO from kernel space resulting in userspace executing the modified code, as shown here to bypass SMEP restrictions: http://itszn.com/blog/?p=21 The memory map (with added vDSO address reporting) shows the vDSO moving into read-only memory: Before: [ 0.143067] vDSO @ ffffffff82004000 [ 0.143551] vDSO @ ffffffff82006000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e05000 20K ro GLB NX pte 0xffffffff81e05000-0xffffffff82000000 2028K ro NX pte 0xffffffff82000000-0xffffffff8214f000 1340K RW GLB NX pte 0xffffffff8214f000-0xffffffff82281000 1224K RW NX pte 0xffffffff82281000-0xffffffff82400000 1532K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd After: [ 0.145062] vDSO @ ffffffff81da1000 [ 0.146057] vDSO @ ffffffff81da4000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e0b000 44K ro GLB NX pte 0xffffffff81e0b000-0xffffffff82000000 2004K ro NX pte 0xffffffff82000000-0xffffffff8214c000 1328K RW GLB NX pte 0xffffffff8214c000-0xffffffff8227e000 1224K RW NX pte 0xffffffff8227e000-0xffffffff82400000 1544K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd Based on work by PaX Team and Brad Spengler. Signed-off-by: Kees Cook Acked-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: Iafbf7314a1106c297ea883031ee96c4f53c04a2b (cherry picked from commit 018ef8dcf3de5f62e2cc1a9273cc27e1c6ba8de5) Signed-off-by: Sami Tolvanen --- arch/x86/entry/vdso/vdso2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987556ce..3f69326ed545 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "#include \n"); fprintf(outfile, "\n"); fprintf(outfile, - "static unsigned char raw_data[%lu] __page_aligned_data = {", + "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); for (j = 0; j < stripped_len; j++) { if (j % 10 == 0) From 3cfedbe672ad7fbd73cb5a073ee6a6f60f8130fa Mon Sep 17 00:00:00 2001 From: David Brown Date: Wed, 17 Feb 2016 14:41:18 -0800 Subject: [PATCH 0464/3220] UPSTREAM: ARM/vdso: Mark the vDSO code read-only after init Although the ARM vDSO is cleanly separated by code/data with the code being read-only in userspace mappings, the code page is still writable from the kernel. There have been exploits (such as http://itszn.com/blog/?p=21) that take advantage of this on x86 to go from a bad kernel write to full root. Prevent this specific exploit class on ARM as well by putting the vDSO code page in post-init read-only memory as well. Before: vdso: 1 text pages at base 80927000 root@Vexpress:/ cat /sys/kernel/debug/kernel_page_tables ---[ Modules ]--- ---[ Kernel Mapping ]--- 0x80000000-0x80100000 1M RW NX SHD 0x80100000-0x80600000 5M ro x SHD 0x80600000-0x80800000 2M ro NX SHD 0x80800000-0xbe000000 984M RW NX SHD After: vdso: 1 text pages at base 8072b000 root@Vexpress:/ cat /sys/kernel/debug/kernel_page_tables ---[ Modules ]--- ---[ Kernel Mapping ]--- 0x80000000-0x80100000 1M RW NX SHD 0x80100000-0x80600000 5M ro x SHD 0x80600000-0x80800000 2M ro NX SHD 0x80800000-0xbe000000 984M RW NX SHD Inspired by https://lkml.org/lkml/2016/1/19/494 based on work by the PaX Team, Brad Spengler, and Kees Cook. Signed-off-by: David Brown Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: Nathan Lynch Cc: PaX Team Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-8-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar Bug: 31660652 Change-Id: I8d3cb7707644343aa907b2d584312ccdad63e270 (cherry picked from commit 11bf9b865898961cee60a41c483c9f27ec76e12e) Signed-off-by: Sami Tolvanen --- arch/arm/vdso/vdso.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/vdso/vdso.S b/arch/arm/vdso/vdso.S index b2b97e3e7bab..a62a7b64f49c 100644 --- a/arch/arm/vdso/vdso.S +++ b/arch/arm/vdso/vdso.S @@ -23,9 +23,8 @@ #include #include - __PAGE_ALIGNED_DATA - .globl vdso_start, vdso_end + .section .data..ro_after_init .balign PAGE_SIZE vdso_start: .incbin "arch/arm/vdso/vdso.so" From 609abcd0c940b2fa0b7a548c58665bfb5c02d22b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 7 Jun 2016 12:20:51 +0200 Subject: [PATCH 0465/3220] UPSTREAM: vmlinux.lds.h: allow arch specific handling of ro_after_init data section commit c74ba8b3480d ("arch: Introduce post-init read-only memory") introduced the __ro_after_init attribute, which allows variables to be added to the ro_after_init data section. This new section was added to rodata, even though it contains writable data. This in turn causes problems on architectures which mark the page table entries read-only that point to rodata very early. This patch allows architectures to implement their own handling of the .data..ro_after_init section. Usually that would be: - mark the rodata section read-only very early - mark the ro_after_init section read-only within mark_rodata_ro Signed-off-by: Heiko Carstens Reviewed-by: Kees Cook Signed-off-by: Martin Schwidefsky Bug: 31660652 Change-Id: If68cb4d86f88678c9bac8c47072775ab85ef5770 (cherry picked from commit 32fb2fc5c357fb99616bbe100dbcb27bc7f5d045) Signed-off-by: Sami Tolvanen --- include/asm-generic/vmlinux.lds.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 772c784ba763..33eb2fe23071 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -248,6 +248,14 @@ . = ALIGN(align); \ *(.data..init_task) +/* + * Allow architectures to handle ro_after_init data on their + * own by defining an empty RO_AFTER_INIT_DATA. + */ +#ifndef RO_AFTER_INIT_DATA +#define RO_AFTER_INIT_DATA *(.data..ro_after_init) +#endif + /* * Read only Data */ @@ -256,7 +264,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ - *(.data..ro_after_init) /* Read only after init */ \ + RO_AFTER_INIT_DATA /* Read only after init */ \ *(__vermagic) /* Kernel version magic */ \ . 
= ALIGN(8); \ VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \ From 4055d6b56ac1de5b900ce6f0b484996aa357ee66 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 23 Sep 2016 10:44:37 -0700 Subject: [PATCH 0466/3220] android-base.cfg: Enable kernel ASLR Bug: 30369029 Change-Id: I0c1c932255866f308d67de1df2ad52c9c19c4799 --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 34fa64a669ac..1ebef5c69fb6 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -139,6 +139,7 @@ CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_PROFILING=y CONFIG_QUOTA=y +CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y From 494a59a68340b0244cd6edf7a367e9233b006a3f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 30 Aug 2016 17:19:13 -0400 Subject: [PATCH 0467/3220] BACKPORT: audit: consistently record PIDs with task_tgid_nr() Unfortunately we record PIDs in audit records using a variety of methods despite the correct way being the use of task_tgid_nr(). This patch converts all of these callers, except for the case of AUDIT_SET in audit_receive_msg() (see the comment in the code). Reported-by: Jeff Vander Stoep Signed-off-by: Paul Moore Bug: 28952093 (cherry picked from commit fa2bea2f5cca5b8d4a3e5520d2e8c0ede67ac108) Signed-off-by: Jeff Vander Stoep Change-Id: If6645f9de8bc58ed9755f28dc6af5fbf08d72a00 --- kernel/audit.c | 8 +++++++- kernel/auditsc.c | 12 ++++++------ security/lsm_audit.c | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..34f690b9213a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -870,6 +870,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) @@ -1896,7 +1902,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 48f45987dc6c..63f0e495f517 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1987,7 +1987,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", oldloginuid, loginuid, oldsessionid, sessionid, !rc); @@ -2212,7 +2212,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; 
- context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2237,7 +2237,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2337,7 +2337,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2369,7 +2369,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } diff --git a/security/lsm_audit.c b/security/lsm_audit.c index cccbf3068cdc..45d927ab807d 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -220,7 +220,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, memcpy(comm, current->comm, sizeof(comm))); switch (a->type) { @@ -294,7 +294,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { - pid_t pid = task_pid_nr(tsk); + pid_t pid = task_tgid_nr(tsk); if (pid) { char comm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); From 1daecfe215ea3379fdc842f264607d37dd8fe091 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Tue, 27 Sep 2016 13:48:29 -0700 Subject: [PATCH 0468/3220] ANDROID: dm: android-verity: Remove fec_header location constraint This CL removes the mandate of the fec_header being located right after the ECC data. 
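For context, the dropped constraint tied the header's position to the data layout; in terms of the fields visible in the diff below, the removed check enforced roughly:

	/*
	 * Old requirement, now removed: the ECC data had to end exactly
	 * where the header sits, i.e.
	 *
	 *   [ input data (inp_size) ][ ECC data (fec_size) ][ fec_header ]
	 *                                                   ^ offset
	 */
	if (offset < le32_to_cpu(header->fec_size) ||
	    offset - le32_to_cpu(header->fec_size) != le64_to_cpu(header->inp_size))
		return -EINVAL;

After this patch only the header's own fields (version, size, roots, and so on) are validated.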
(Cherry-picked from https://android-review.googlesource.com/#/c/280401) Bug: 28865197 Signed-off-by: Badhri Jagan Sridharan Change-Id: Ie04c8cf2dd755f54d02dbdc4e734a13d6f6507b5 --- drivers/md/dm-android-verity.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index 15ce2a81c1f4..bb6c1285e499 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -266,10 +266,7 @@ static inline int validate_fec_header(struct fec_header *header, u64 offset) le32_to_cpu(header->version) != FEC_VERSION || le32_to_cpu(header->size) != sizeof(struct fec_header) || le32_to_cpu(header->roots) == 0 || - le32_to_cpu(header->roots) >= FEC_RSM || - offset < le32_to_cpu(header->fec_size) || - offset - le32_to_cpu(header->fec_size) != - le64_to_cpu(header->inp_size)) + le32_to_cpu(header->roots) >= FEC_RSM) return -EINVAL; return 0; From bb849676d5a9351c7d263c20b5ef9e67ec463412 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:15 +0100 Subject: [PATCH 0469/3220] UPSTREAM: mm/memblock: add MEMBLOCK_NOMAP attribute to memblock memory table This introduces the MEMBLOCK_NOMAP attribute and the required plumbing to make it usable as an indicator that some parts of normal memory should not be covered by the kernel direct mapping. It is up to the arch to actually honor the attribute when laying out this mapping, but the memblock code itself is modified to disregard these regions for allocations and other general use. Cc: linux-mm@kvack.org Cc: Alexander Kuleshov Cc: Andrew Morton Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I55cd3abdf514ac54c071fa0037d8dac73bda798d (cherry picked from commit bf3d3cc580f9960883ebf9ea05868f336d9491c2) Signed-off-by: Sami Tolvanen --- include/linux/memblock.h | 8 ++++++++ mm/memblock.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 24daf8fc4d7c..fec66f86eeff 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -25,6 +25,7 @@ enum { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ MEMBLOCK_MIRROR = 0x2, /* mirrored region */ + MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ }; struct memblock_region { @@ -82,6 +83,7 @@ bool memblock_overlaps_region(struct memblock_type *type, int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); +int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); ulong choose_memblock_flags(void); /* Low level functions */ @@ -184,6 +186,11 @@ static inline bool memblock_is_mirror(struct memblock_region *m) return m->flags & MEMBLOCK_MIRROR; } +static inline bool memblock_is_nomap(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_NOMAP; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); @@ -319,6 +326,7 @@ phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); int memblock_is_memory(phys_addr_t addr); +int memblock_is_map_memory(phys_addr_t addr); int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); int memblock_is_reserved(phys_addr_t addr); bool memblock_is_region_reserved(phys_addr_t 
base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index d300f1329814..07ff069fef25 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -822,6 +822,17 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); } +/** + * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); +} /** * __next_reserved_mem_region - next function for for_each_reserved_region() @@ -913,6 +924,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1022,6 +1037,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1519,6 +1538,15 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +int __init_memblock memblock_is_map_memory(phys_addr_t addr) +{ + int i = memblock_search(&memblock.memory, addr); + + if (i == -1) + return false; + return !memblock_is_nomap(&memblock.memory.regions[i]); +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) From b6535fe666034be99b777a0dfad672636e17de35 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:16 +0100 Subject: [PATCH 0470/3220] BACKPORT: arm64: only consider memblocks with NOMAP cleared for linear mapping Take the new memblock attribute MEMBLOCK_NOMAP into account when deciding whether a certain region is or should be covered by the kernel direct mapping. 
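A minimal usage sketch from the platform side (base and size invented for illustration): the region is registered as memory but never enters the linear mapping, and with the arm64 change below pfn_valid() then returns false for its pages:

	#include <linux/init.h>
	#include <linux/memblock.h>

	#define FW_BASE	0x80000000UL	/* hypothetical firmware-owned RAM */
	#define FW_SIZE	0x00200000UL

	static void __init keep_fw_region_unmapped(void)
	{
		memblock_add(FW_BASE, FW_SIZE);		/* account it as memory... */
		memblock_mark_nomap(FW_BASE, FW_SIZE);	/* ...but keep it unmapped */
	}
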
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Id7346a09bb3aee5e9a5ef8812251f80cf8265532 (cherry picked from commit 68709f45385aeddb0ca96a060c0c8259944f321b) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/init.c | 2 +- arch/arm64/mm/mmu.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 2c38be3df4c8..08aa45ff05b6 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -131,7 +131,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) int pfn_valid(unsigned long pfn) { - return (pfn & PFN_MASK) == pfn && memblock_is_memory(pfn << PAGE_SHIFT); + return (pfn & PFN_MASK) == pfn && memblock_is_map_memory(pfn << PAGE_SHIFT); } EXPORT_SYMBOL(pfn_valid); #endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 72e634533a3f..9dafcfde7c26 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -438,6 +438,8 @@ static void __init map_mem(pgd_t *pgd) if (start >= end) break; + if (memblock_is_nomap(reg)) + continue; __map_memblock(pgd, start, end); } From 4ff6ae812864e5122da39ca70072987ce047f374 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:17 +0100 Subject: [PATCH 0471/3220] UPSTREAM: arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP Change the EFI memory reservation logic to use memblock_mark_nomap() rather than memblock_reserve() to mark UEFI reserved regions as occupied. In addition to reserving them against allocations done by memblock, this will also prevent them from being covered by the linear mapping. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Ia3ce78f40f8d41a9afdd42238fe9cbfd81bbff08 (cherry picked from commit 4dffbfc48d65e5d8157a634fd670065d237a9377) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/efi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4eeb17198cfa..04531d35f1df 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -187,7 +187,7 @@ static __init void reserve_regions(void) early_init_dt_add_memory_arch(paddr, size); if (is_reserve_region(md)) { - memblock_reserve(paddr, size); + memblock_mark_nomap(paddr, size); if (efi_enabled(EFI_DBG)) pr_cont("*"); } @@ -209,8 +209,6 @@ void __init efi_init(void) efi_system_table = params.system_table; - memblock_reserve(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + (params.mmap & ~PAGE_MASK))); memmap.phys_map = params.mmap; memmap.map = early_memremap(params.mmap, params.mmap_size); if (memmap.map == NULL) { @@ -230,6 +228,9 @@ void __init efi_init(void) reserve_regions(); early_memunmap(memmap.map, params.mmap_size); + memblock_mark_nomap(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); } static bool __init efi_virtmap_init(void) From 0467f02fb679b2de7c583af61cb6e48aca8237fc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:18 +0100 Subject: [PATCH 0472/3220] UPSTREAM: arm64/efi: split off EFI init and runtime code for reuse by 32-bit ARM This splits off the early EFI init and runtime code that - discovers the EFI params and the memory map from the FDT, and installs the memblocks and config tables. - prepares and installs the EFI page tables so that UEFI Runtime Services can be invoked at the virtual address installed by the stub. This will allow it to be reused for 32-bit ARM. 
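Both halves of the split revolve around the same walk over the UEFI memory map; the recurring idiom, for reference (as used throughout the code being moved below):

	efi_memory_desc_t *md;

	for_each_efi_memory_desc(&memmap, md) {
		if (!(md->attribute & EFI_MEMORY_RUNTIME))
			continue;	/* only runtime regions get virtual mappings */
		/* region: md->num_pages << EFI_PAGE_SHIFT bytes at
		 * md->phys_addr, remapped by the stub at md->virt_addr */
	}
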
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I143e4b38a5426f70027eff6cc5f732ac370ae69d (cherry picked from commit e5bc22a42e4d46cc203fdfb6d2c76202b08666a0) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/efi.c | 326 +---------------------------- drivers/firmware/efi/Makefile | 3 + drivers/firmware/efi/arm-init.c | 208 ++++++++++++++++++ drivers/firmware/efi/arm-runtime.c | 151 +++++++++++++ 4 files changed, 363 insertions(+), 325 deletions(-) create mode 100644 drivers/firmware/efi/arm-init.c create mode 100644 drivers/firmware/efi/arm-runtime.c diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 04531d35f1df..bd3b2f5adf0c 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -11,318 +11,11 @@ * */ -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include -#include #include -#include -#include -#include -#include - -struct efi_memory_map memmap; - -static u64 efi_system_table; - -static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; - -static struct mm_struct efi_mm = { - .mm_rb = RB_ROOT, - .pgd = efi_pgd, - .mm_users = ATOMIC_INIT(2), - .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), - .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), - .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), -}; - -static int __init is_normal_ram(efi_memory_desc_t *md) -{ - if (md->attribute & EFI_MEMORY_WB) - return 1; - return 0; -} - -/* - * Translate a EFI virtual address into a physical address: this is necessary, - * as some data members of the EFI system table are virtually remapped after - * SetVirtualAddressMap() has been called. - */ -static phys_addr_t efi_to_phys(unsigned long addr) -{ - efi_memory_desc_t *md; - - for_each_efi_memory_desc(&memmap, md) { - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - if (md->virt_addr == 0) - /* no virtual mapping has been installed by the stub */ - break; - if (md->virt_addr <= addr && - (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT)) - return md->phys_addr + addr - md->virt_addr; - } - return addr; -} - -static int __init uefi_init(void) -{ - efi_char16_t *c16; - void *config_tables; - u64 table_size; - char vendor[100] = "unknown"; - int i, retval; - - efi.systab = early_memremap(efi_system_table, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) { - pr_warn("Unable to map EFI system table.\n"); - return -ENOMEM; - } - - set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); - - /* - * Verify the EFI Table - */ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { - pr_err("System table signature incorrect\n"); - retval = -EINVAL; - goto out; - } - if ((efi.systab->hdr.revision >> 16) < 2) - pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* Show what we know for posterity */ - c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), - sizeof(vendor) * sizeof(efi_char16_t)); - if (c16) { - for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = c16[i]; - vendor[i] = '\0'; - early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); - } - - pr_info("EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables; - config_tables = 
early_memremap(efi_to_phys(efi.systab->tables), - table_size); - if (config_tables == NULL) { - pr_warn("Unable to map EFI config table array.\n"); - retval = -ENOMEM; - goto out; - } - retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_64_t), NULL); - - early_memunmap(config_tables, table_size); -out: - early_memunmap(efi.systab, sizeof(efi_system_table_t)); - return retval; -} - -/* - * Return true for RAM regions we want to permanently reserve. - */ -static __init int is_reserve_region(efi_memory_desc_t *md) -{ - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - case EFI_PERSISTENT_MEMORY: - return 0; - default: - break; - } - return is_normal_ram(md); -} - -static __init void reserve_regions(void) -{ - efi_memory_desc_t *md; - u64 paddr, npages, size; - - if (efi_enabled(EFI_DBG)) - pr_info("Processing EFI memory map:\n"); - - for_each_efi_memory_desc(&memmap, md) { - paddr = md->phys_addr; - npages = md->num_pages; - - if (efi_enabled(EFI_DBG)) { - char buf[64]; - - pr_info(" 0x%012llx-0x%012llx %s", - paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, - efi_md_typeattr_format(buf, sizeof(buf), md)); - } - - memrange_efi_to_native(&paddr, &npages); - size = npages << PAGE_SHIFT; - - if (is_normal_ram(md)) - early_init_dt_add_memory_arch(paddr, size); - - if (is_reserve_region(md)) { - memblock_mark_nomap(paddr, size); - if (efi_enabled(EFI_DBG)) - pr_cont("*"); - } - - if (efi_enabled(EFI_DBG)) - pr_cont("\n"); - } - - set_bit(EFI_MEMMAP, &efi.flags); -} - -void __init efi_init(void) -{ - struct efi_fdt_params params; - - /* Grab UEFI information placed in FDT by stub */ - if (!efi_get_fdt_params(¶ms)) - return; - - efi_system_table = params.system_table; - - memmap.phys_map = params.mmap; - memmap.map = early_memremap(params.mmap, params.mmap_size); - if (memmap.map == NULL) { - /* - * If we are booting via UEFI, the UEFI memory map is the only - * description of memory we have, so there is little point in - * proceeding if we cannot access it. - */ - panic("Unable to map EFI memory map.\n"); - } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; - - if (uefi_init() < 0) - return; - - reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); -} - -static bool __init efi_virtmap_init(void) -{ - efi_memory_desc_t *md; - - init_new_context(NULL, &efi_mm); - - for_each_efi_memory_desc(&memmap, md) { - pgprot_t prot; - - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - if (md->virt_addr == 0) - return false; - - pr_info(" EFI remap 0x%016llx => %p\n", - md->phys_addr, (void *)md->virt_addr); - - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. 
- */ - if (!is_normal_ram(md)) - prot = __pgprot(PROT_DEVICE_nGnRE); - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot = PAGE_KERNEL_EXEC; - else - prot = PAGE_KERNEL; - - create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, - md->num_pages << EFI_PAGE_SHIFT, - __pgprot(pgprot_val(prot) | PTE_NG)); - } - return true; -} - -/* - * Enable the UEFI Runtime Services if all prerequisites are in place, i.e., - * non-early mapping of the UEFI system table and virtual mappings for all - * EFI_MEMORY_RUNTIME regions. - */ -static int __init arm64_enable_runtime_services(void) -{ - u64 mapsize; - - if (!efi_enabled(EFI_BOOT)) { - pr_info("EFI services will not be available.\n"); - return 0; - } - - if (efi_runtime_disabled()) { - pr_info("EFI runtime services will be disabled.\n"); - return 0; - } - - pr_info("Remapping and enabling EFI services.\n"); - - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { - pr_err("Failed to remap EFI memory map\n"); - return -ENOMEM; - } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; - - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); - return -ENOMEM; - } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - - if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); - return -ENOMEM; - } - - /* Set up runtime services function pointers */ - efi_native_runtime_setup(); - set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - - efi.runtime_version = efi.systab->hdr.revision; - - return 0; -} -early_initcall(arm64_enable_runtime_services); static int __init arm64_dmi_init(void) { @@ -338,23 +31,6 @@ static int __init arm64_dmi_init(void) } core_initcall(arm64_dmi_init); -static void efi_set_pgd(struct mm_struct *mm) -{ - switch_mm(NULL, mm, NULL); -} - -void efi_virtmap_load(void) -{ - preempt_disable(); - efi_set_pgd(&efi_mm); -} - -void efi_virtmap_unload(void) -{ - efi_set_pgd(current->active_mm); - preempt_enable(); -} - /* * UpdateCapsule() depends on the system being shutdown via * ResetSystem(). diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index ec379a4164cc..f292917b00e7 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -18,3 +18,6 @@ obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o + +arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o +obj-$(CONFIG_ARM64) += $(arm-obj-y) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c new file mode 100644 index 000000000000..ffdd76a51929 --- /dev/null +++ b/drivers/firmware/efi/arm-init.c @@ -0,0 +1,208 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 2.4 + * + * Copyright (C) 2013 - 2015 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include + +struct efi_memory_map memmap; + +u64 efi_system_table; + +static int __init is_normal_ram(efi_memory_desc_t *md) +{ + if (md->attribute & EFI_MEMORY_WB) + return 1; + return 0; +} + +/* + * Translate a EFI virtual address into a physical address: this is necessary, + * as some data members of the EFI system table are virtually remapped after + * SetVirtualAddressMap() has been called. + */ +static phys_addr_t efi_to_phys(unsigned long addr) +{ + efi_memory_desc_t *md; + + for_each_efi_memory_desc(&memmap, md) { + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) + /* no virtual mapping has been installed by the stub */ + break; + if (md->virt_addr <= addr && + (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT)) + return md->phys_addr + addr - md->virt_addr; + } + return addr; +} + +static int __init uefi_init(void) +{ + efi_char16_t *c16; + void *config_tables; + u64 table_size; + char vendor[100] = "unknown"; + int i, retval; + + efi.systab = early_memremap(efi_system_table, + sizeof(efi_system_table_t)); + if (efi.systab == NULL) { + pr_warn("Unable to map EFI system table.\n"); + return -ENOMEM; + } + + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + + /* + * Verify the EFI Table + */ + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { + pr_err("System table signature incorrect\n"); + retval = -EINVAL; + goto out; + } + if ((efi.systab->hdr.revision >> 16) < 2) + pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* Show what we know for posterity */ + c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), + sizeof(vendor) * sizeof(efi_char16_t)); + if (c16) { + for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) + vendor[i] = c16[i]; + vendor[i] = '\0'; + early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); + } + + pr_info("EFI v%u.%.02u by %s\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables; + config_tables = early_memremap(efi_to_phys(efi.systab->tables), + table_size); + if (config_tables == NULL) { + pr_warn("Unable to map EFI config table array.\n"); + retval = -ENOMEM; + goto out; + } + retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, + sizeof(efi_config_table_64_t), NULL); + + early_memunmap(config_tables, table_size); +out: + early_memunmap(efi.systab, sizeof(efi_system_table_t)); + return retval; +} + +/* + * Return true for RAM regions we want to permanently reserve. 
+ */ +static __init int is_reserve_region(efi_memory_desc_t *md) +{ + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + return 0; + default: + break; + } + return is_normal_ram(md); +} + +static __init void reserve_regions(void) +{ + efi_memory_desc_t *md; + u64 paddr, npages, size; + + if (efi_enabled(EFI_DBG)) + pr_info("Processing EFI memory map:\n"); + + for_each_efi_memory_desc(&memmap, md) { + paddr = md->phys_addr; + npages = md->num_pages; + + if (efi_enabled(EFI_DBG)) { + char buf[64]; + + pr_info(" 0x%012llx-0x%012llx %s", + paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, + efi_md_typeattr_format(buf, sizeof(buf), md)); + } + + memrange_efi_to_native(&paddr, &npages); + size = npages << PAGE_SHIFT; + + if (is_normal_ram(md)) + early_init_dt_add_memory_arch(paddr, size); + + if (is_reserve_region(md)) { + memblock_mark_nomap(paddr, size); + if (efi_enabled(EFI_DBG)) + pr_cont("*"); + } + + if (efi_enabled(EFI_DBG)) + pr_cont("\n"); + } + + set_bit(EFI_MEMMAP, &efi.flags); +} + +void __init efi_init(void) +{ + struct efi_fdt_params params; + + /* Grab UEFI information placed in FDT by stub */ + if (!efi_get_fdt_params(¶ms)) + return; + + efi_system_table = params.system_table; + + memmap.phys_map = params.mmap; + memmap.map = early_memremap(params.mmap, params.mmap_size); + if (memmap.map == NULL) { + /* + * If we are booting via UEFI, the UEFI memory map is the only + * description of memory we have, so there is little point in + * proceeding if we cannot access it. + */ + panic("Unable to map EFI memory map.\n"); + } + memmap.map_end = memmap.map + params.mmap_size; + memmap.desc_size = params.desc_size; + memmap.desc_version = params.desc_ver; + + if (uefi_init() < 0) + return; + + reserve_regions(); + early_memunmap(memmap.map, params.mmap_size); + memblock_mark_nomap(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); +} diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c new file mode 100644 index 000000000000..974743e13a4d --- /dev/null +++ b/drivers/firmware/efi/arm-runtime.c @@ -0,0 +1,151 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 2.4 + * + * Copyright (C) 2013, 2014 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; + +extern u64 efi_system_table; + +static struct mm_struct efi_mm = { + .mm_rb = RB_ROOT, + .pgd = efi_pgd, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), +}; + +static bool __init efi_virtmap_init(void) +{ + efi_memory_desc_t *md; + + init_new_context(NULL, &efi_mm); + + for_each_efi_memory_desc(&memmap, md) { + pgprot_t prot; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) + return false; + + pr_info(" EFI remap 0x%016llx => %p\n", + md->phys_addr, (void *)md->virt_addr); + + /* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. + */ + if ((md->attribute & EFI_MEMORY_WB) == 0) + prot = __pgprot(PROT_DEVICE_nGnRE); + else if (md->type == EFI_RUNTIME_SERVICES_CODE || + !PAGE_ALIGNED(md->phys_addr)) + prot = PAGE_KERNEL_EXEC; + else + prot = PAGE_KERNEL; + + create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + __pgprot(pgprot_val(prot) | PTE_NG)); + } + return true; +} + +/* + * Enable the UEFI Runtime Services if all prerequisites are in place, i.e., + * non-early mapping of the UEFI system table and virtual mappings for all + * EFI_MEMORY_RUNTIME regions. + */ +static int __init arm64_enable_runtime_services(void) +{ + u64 mapsize; + + if (!efi_enabled(EFI_BOOT)) { + pr_info("EFI services will not be available.\n"); + return 0; + } + + if (efi_runtime_disabled()) { + pr_info("EFI runtime services will be disabled.\n"); + return 0; + } + + pr_info("Remapping and enabling EFI services.\n"); + + mapsize = memmap.map_end - memmap.map; + memmap.map = (__force void *)ioremap_cache(memmap.phys_map, + mapsize); + if (!memmap.map) { + pr_err("Failed to remap EFI memory map\n"); + return -ENOMEM; + } + memmap.map_end = memmap.map + mapsize; + efi.memmap = &memmap; + + efi.systab = (__force void *)ioremap_cache(efi_system_table, + sizeof(efi_system_table_t)); + if (!efi.systab) { + pr_err("Failed to remap EFI System Table\n"); + return -ENOMEM; + } + set_bit(EFI_SYSTEM_TABLES, &efi.flags); + + if (!efi_virtmap_init()) { + pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); + return -ENOMEM; + } + + /* Set up runtime services function pointers */ + efi_native_runtime_setup(); + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); + + efi.runtime_version = efi.systab->hdr.revision; + + return 0; +} +early_initcall(arm64_enable_runtime_services); + +static void efi_set_pgd(struct mm_struct *mm) +{ + switch_mm(NULL, mm, NULL); +} + +void efi_virtmap_load(void) +{ + preempt_disable(); + efi_set_pgd(&efi_mm); +} + +void efi_virtmap_unload(void) +{ + efi_set_pgd(current->active_mm); + preempt_enable(); +} From e5ef8437e39123daae78dbd9010c45226681d076 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:19 +0100 Subject: [PATCH 0473/3220] UPSTREAM: arm64/efi: refactor EFI init and runtime code for reuse by 32-bit ARM This refactors the EFI init and runtime code that will be shared between arm64 and ARM so that it can be built for both archs. 
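The heart of the refactor is that the per-region page-protection policy moves behind an arch hook; in outline (signature and call site taken from the diff below):

	/* arch code (arch/arm64/kernel/efi.c) chooses the protections: */
	int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);

	/* generic arm-runtime.c just walks the map and delegates: */
	for_each_efi_memory_desc(&memmap, md) {
		if (!(md->attribute & EFI_MEMORY_RUNTIME))
			continue;
		if (efi_create_mapping(&efi_mm, md))
			return false;	/* arch could not map this region */
	}
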
Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: Ieee70bbe117170d2054a9c82c4f1a8143b7e302b (cherry picked from commit f7d924894265794f447ea799dd853400749b5a22) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 9 ++++++ arch/arm64/kernel/efi.c | 23 ++++++++++++++ drivers/firmware/efi/arm-init.c | 7 +++-- drivers/firmware/efi/arm-runtime.c | 48 ++++++++++-------------------- drivers/firmware/efi/efi.c | 2 ++ 5 files changed, 54 insertions(+), 35 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index ef572206f1c3..8e88a696c9cb 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -2,7 +2,9 @@ #define _ASM_EFI_H #include +#include #include +#include #ifdef CONFIG_EFI extern void efi_init(void); @@ -10,6 +12,8 @@ extern void efi_init(void); #define efi_init() #endif +int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); + #define efi_call_virt(f, ...) \ ({ \ efi_##f##_t *__f; \ @@ -63,6 +67,11 @@ extern void efi_init(void); * Services are enabled and the EFI_RUNTIME_SERVICES bit set. */ +static inline void efi_set_pgd(struct mm_struct *mm) +{ + switch_mm(NULL, mm, NULL); +} + void efi_virtmap_load(void); void efi_virtmap_unload(void); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index bd3b2f5adf0c..b6abc852f2a1 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,6 +17,29 @@ #include +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val; + + /* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. + */ + if ((md->attribute & EFI_MEMORY_WB) == 0) + prot_val = PROT_DEVICE_nGnRE; + else if (md->type == EFI_RUNTIME_SERVICES_CODE || + !PAGE_ALIGNED(md->phys_addr)) + prot_val = pgprot_val(PAGE_KERNEL_EXEC); + else + prot_val = pgprot_val(PAGE_KERNEL); + + create_pgd_mapping(mm, md->phys_addr, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + __pgprot(prot_val | PTE_NG)); + return 0; +} + static int __init arm64_dmi_init(void) { /* diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index ffdd76a51929..9e15d571b53c 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -57,7 +57,7 @@ static int __init uefi_init(void) { efi_char16_t *c16; void *config_tables; - u64 table_size; + size_t table_size; char vendor[100] = "unknown"; int i, retval; @@ -69,7 +69,8 @@ static int __init uefi_init(void) } set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); + if (IS_ENABLED(CONFIG_64BIT)) + set_bit(EFI_64BIT, &efi.flags); /* * Verify the EFI Table @@ -107,7 +108,7 @@ static int __init uefi_init(void) goto out; } retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_64_t), NULL); + sizeof(efi_config_table_t), NULL); early_memunmap(config_tables, table_size); out: diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 974743e13a4d..6ae21e41a429 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -23,18 +24,14 @@ #include #include -#include -#include #include +#include #include -static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; - extern u64 efi_system_table; static struct mm_struct efi_mm = { .mm_rb = RB_ROOT, - .pgd = efi_pgd, .mm_users = ATOMIC_INIT(2), 
 	.mm_count	= ATOMIC_INIT(1),
 	.mmap_sem	= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
@@ -46,35 +43,27 @@ static bool __init efi_virtmap_init(void)
 {
 	efi_memory_desc_t *md;
 
+	efi_mm.pgd = pgd_alloc(&efi_mm);
 	init_new_context(NULL, &efi_mm);
 
 	for_each_efi_memory_desc(&memmap, md) {
-		pgprot_t prot;
+		phys_addr_t phys = md->phys_addr;
+		int ret;
 
 		if (!(md->attribute & EFI_MEMORY_RUNTIME))
 			continue;
 		if (md->virt_addr == 0)
 			return false;
 
-		pr_info("  EFI remap 0x%016llx => %p\n",
-			md->phys_addr, (void *)md->virt_addr);
-
-		/*
-		 * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be
-		 * executable, everything else can be mapped with the XN bits
-		 * set.
-		 */
-		if ((md->attribute & EFI_MEMORY_WB) == 0)
-			prot = __pgprot(PROT_DEVICE_nGnRE);
-		else if (md->type == EFI_RUNTIME_SERVICES_CODE ||
-			 !PAGE_ALIGNED(md->phys_addr))
-			prot = PAGE_KERNEL_EXEC;
-		else
-			prot = PAGE_KERNEL;
-
-		create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr,
-				   md->num_pages << EFI_PAGE_SHIFT,
-				   __pgprot(pgprot_val(prot) | PTE_NG));
+		ret = efi_create_mapping(&efi_mm, md);
+		if (!ret) {
+			pr_info("  EFI remap %pa => %p\n",
+				&phys, (void *)(unsigned long)md->virt_addr);
+		} else {
+			pr_warn("  EFI remap %pa: failed to create mapping (%d)\n",
+				&phys, ret);
+			return false;
+		}
 	}
 	return true;
 }
@@ -84,7 +73,7 @@ static bool __init efi_virtmap_init(void)
  * non-early mapping of the UEFI system table and virtual mappings for all
  * EFI_MEMORY_RUNTIME regions.
  */
-static int __init arm64_enable_runtime_services(void)
+static int __init arm_enable_runtime_services(void)
 {
 	u64 mapsize;
 
@@ -131,12 +120,7 @@ static int __init arm64_enable_runtime_services(void)
 
 	return 0;
 }
-early_initcall(arm64_enable_runtime_services);
-
-static void efi_set_pgd(struct mm_struct *mm)
-{
-	switch_mm(NULL, mm, NULL);
-}
+early_initcall(arm_enable_runtime_services);
 
 void efi_virtmap_load(void)
 {
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 027ca212179f..cffa89b3317b 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -25,6 +25,8 @@
 #include
 #include
 
+#include
+
 struct efi __read_mostly efi = {
 	.mps = EFI_INVALID_TABLE_ADDR,
 	.acpi = EFI_INVALID_TABLE_ADDR,

From d5f46e4b247971c9c10f902151dcf7df8c750b1c Mon Sep 17 00:00:00 2001
From: Mark Rutland
Date: Thu, 5 Nov 2015 15:09:17 +0000
Subject: [PATCH 0474/3220] UPSTREAM: arm64: Add macros to read/write system
 registers

Rather than crafting custom macros for reading/writing each system
register, provide generic accessors, read_sysreg and write_sysreg, for
this purpose.
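
For example (an illustrative use, not part of this patch):

	u64 val = read_sysreg(cntkctl_el1);	/* emits: mrs %0, cntkctl_el1 */
	write_sysreg(val, cntkctl_el1);		/* emits: msr cntkctl_el1, %0 */

The register name is pasted into the mrs/msr instruction via
__stringify(), so a mistyped name is rejected by the assembler rather
than silently accepted at the C level.
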
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Cc: Suzuki Poulose Cc: Will Deacon Signed-off-by: Marc Zyngier Change-Id: I1d6cf948bc6660dfd096ff5a18eba682941098c1 (cherry picked from commit 3600c2fdc09a43a30909743569e35a29121602ed) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/sysreg.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b9fd8ec79033..1a78d6e2a78b 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -20,6 +20,8 @@ #ifndef __ASM_SYSREG_H #define __ASM_SYSREG_H +#include + #include /* @@ -215,6 +217,8 @@ #else +#include + asm( " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" " .equ .L__reg_num_x\\num, \\num\n" @@ -239,6 +243,23 @@ static inline void config_sctlr_el1(u32 clear, u32 set) val |= set; asm volatile("msr sctlr_el1, %0" : : "r" (val)); } + +/* + * Unlike read_cpuid, calls to read_sysreg are never expected to be + * optimized away or replaced with synthetic values. + */ +#define read_sysreg(r) ({ \ + u64 __val; \ + asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \ + __val; \ +}) + +#define write_sysreg(v, r) do { \ + u64 __val = (u64)v; \ + asm volatile("msr " __stringify(r) ", %0" \ + : : "r" (__val)); \ +} while (0) + #endif #endif /* __ASM_SYSREG_H */ From 3e64a8612dfa8738b95ad904a454b859ec9d9174 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Wed, 24 Feb 2016 17:44:57 -0800 Subject: [PATCH 0475/3220] UPSTREAM: arm64: Add workaround for Cavium erratum 27456 On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI instructions may cause the icache to become corrupted if it contains data for a non-current ASID. This patch implements the workaround (which invalidates the local icache when switching the mm) by using code patching. Signed-off-by: Andrew Pinski Signed-off-by: David Daney Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Change-Id: I60e6d17926b067a4e022d7b159e239114303a547 (cherry picked from commit 104a0c02e8b1936c049e18a6d4e4ab040fb61213) Signed-off-by: Sami Tolvanen --- Documentation/arm64/silicon-errata.txt | 1 + arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/kernel/cpu_errata.c | 9 +++++++++ arch/arm64/mm/proc.S | 12 ++++++++++++ 5 files changed, 35 insertions(+), 1 deletion(-) diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 58b71ddf9b60..ba4b6acfc545 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -56,3 +56,4 @@ stable kernels. | | | | | | Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | | Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | +| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bb92d3875791..bf24d67a1ff1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -439,6 +439,17 @@ config CAVIUM_ERRATUM_23154 If unsure, say Y. +config CAVIUM_ERRATUM_27456 + bool "Cavium erratum 27456: Broadcast TLBI instructions may cause icache corruption" + default y + help + On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI + instructions may cause the icache to become corrupted if it + contains data for a non-current ASID. The fix is to + invalidate the icache when changing the mm context. + + If unsure, say Y. 
+ endmenu diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 37a53fc6b384..727e594ac5c2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -33,8 +33,9 @@ #define ARM64_HAS_NO_HW_PREFETCH 8 #define ARM64_HAS_UAO 9 #define ARM64_ALT_PAN_NOT_UAO 10 +#define ARM64_WORKAROUND_CAVIUM_27456 12 -#define ARM64_NCAPS 11 +#define ARM64_NCAPS 13 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index e6bc988e8dbf..06afd04e02c0 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -87,6 +87,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_WORKAROUND_CAVIUM_23154, MIDR_RANGE(MIDR_THUNDERX, 0x00, 0x01), }, +#endif +#ifdef CONFIG_CAVIUM_ERRATUM_27456 + { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + .desc = "Cavium erratum 27456", + .capability = ARM64_WORKAROUND_CAVIUM_27456, + MIDR_RANGE(MIDR_THUNDERX, 0x00, + (1 << MIDR_VARIANT_SHIFT) | 1), + }, #endif { } diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 0c19534a901e..543f5198005a 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "proc-macros.S" @@ -137,7 +139,17 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 ret + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb + ret +alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From b4ad81e8d05dc75ee527c56f6936ebc7fa3f46a4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Mar 2016 10:58:09 +0100 Subject: [PATCH 0476/3220] UPSTREAM: arm64/kernel: fix incorrect EL0 check in inv_entry macro The implementation of macro inv_entry refers to its 'el' argument without the required leading backslash, which results in an undefined symbol 'el' to be passed into the kernel_entry macro rather than the index of the exception level as intended. This undefined symbol strangely enough does not result in build failures, although it is visible in vmlinux: $ nm -n vmlinux |head U el 0000000000000000 A _kernel_flags_le_hi32 0000000000000000 A _kernel_offset_le_hi32 0000000000000000 A _kernel_size_le_hi32 000000000000000a A _kernel_flags_le_lo32 ..... However, it does result in incorrect code being generated for invalid exceptions taken from EL0, since the argument check in kernel_entry assumes EL1 if its argument does not equal '0'. 
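
Seen side by side (a sketch of the one-character difference):

	kernel_entry el, \regsize	// before: 'el' is an ordinary symbol reference
	kernel_entry \el, \regsize	// after: '\el' substitutes the macro argument

Only the second form passes the caller's exception level through; the
first hands kernel_entry the undefined symbol 'el', which compares
unequal to '0' and is therefore treated as EL1.
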
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Change-Id: I406c1207682a4dff3054a019c26fdf1310b08ed1 (cherry picked from commit b660950c60a7278f9d8deb7c32a162031207c758) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 1f7f5a2b61bf..12e8d2bcb3f9 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -277,7 +277,7 @@ END(vectors) * Invalid mode handlers */ .macro inv_entry, el, reason, regsize = 64 - kernel_entry el, \regsize + kernel_entry \el, \regsize mov x0, sp mov x1, #\reason mrs x2, esr_el1 From c1ee3c7d19d2735996d99f6c94a50ac77a9ea338 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:46 +0200 Subject: [PATCH 0477/3220] UPSTREAM: arm64/mm: ensure memstart_addr remains sufficiently aligned After choosing memstart_addr to be the highest multiple of ARM64_MEMSTART_ALIGN less than or equal to the first usable physical memory address, we clip the memblocks to the maximum size of the linear region. Since the kernel may be high up in memory, we take care not to clip the kernel itself, which means we have to clip some memory from the bottom if this occurs, to ensure that the distance between the first and the last usable physical memory address can be covered by the linear region. However, we fail to update memstart_addr if this clipping from the bottom occurs, which means that we may still end up with virtual addresses that wrap into the userland range. So increment memstart_addr as appropriate to prevent this from happening. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I72306207cc46a30b780f5e00b9ef23aa8409867e (cherry picked from commit 2958987f5da2ebcf6a237c5f154d7e3340e60945) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/init.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 08aa45ff05b6..0db12e9e8765 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -192,8 +192,12 @@ void __init arm64_memblock_init(void) */ memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)), ULLONG_MAX); - if (memblock_end_of_DRAM() > linear_region_size) - memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) { + /* ensure that memstart_addr remains sufficiently aligned */ + memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size, + ARM64_MEMSTART_ALIGN); + memblock_remove(0, memstart_addr); + } /* * Apply the memory limit if it was set. Since the kernel may be loaded From 9297d46fb6479dad23622f61760dd9fb1700442d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 14:25:47 +0200 Subject: [PATCH 0478/3220] UPSTREAM: arm64: choose memstart_addr based on minimum sparsemem section alignment This redefines ARM64_MEMSTART_ALIGN in terms of the minimal alignment required by sparsemem vmemmap. This comes down to using 1 GB for all translation granules if CONFIG_SPARSEMEM_VMEMMAP is enabled. 
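
Worked through for the definitions added below (assuming arm64's
SECTION_SIZE_BITS of 30 at this point in the tree):

	4K  pages: ARM64_MEMSTART_SHIFT = PUD_SHIFT     = 30  ->  1 GB
	16K pages: ARM64_MEMSTART_SHIFT = PMD_SHIFT + 5 = 30  ->  1 GB
	64K pages: ARM64_MEMSTART_SHIFT = PMD_SHIFT     = 29  ->  512 MB

With CONFIG_SPARSEMEM_VMEMMAP enabled, any shift below SECTION_SIZE_BITS
is rounded up to it, so ARM64_MEMSTART_ALIGN becomes 1 GB for all three
granules.
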
Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I05b8bc6ab24f677f263b09d7c31fcce4f21269b1 (cherry picked from commit 06e9bf2fd9b372bc1c757995b6ca1cfab0720acb) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/kernel-pgtable.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 5c6375d8528b..7e51d1b57c0c 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include /* * The linear mapping and the start of memory are both 2M aligned (per @@ -86,10 +87,24 @@ * (64k granule), or a multiple that can be mapped using contiguous bits * in the page tables: 32 * PMD_SIZE (16k granule) */ -#ifdef CONFIG_ARM64_64K_PAGES -#define ARM64_MEMSTART_ALIGN SZ_512M +#if defined(CONFIG_ARM64_4K_PAGES) +#define ARM64_MEMSTART_SHIFT PUD_SHIFT +#elif defined(CONFIG_ARM64_16K_PAGES) +#define ARM64_MEMSTART_SHIFT (PMD_SHIFT + 5) #else -#define ARM64_MEMSTART_ALIGN SZ_1G +#define ARM64_MEMSTART_SHIFT PMD_SHIFT +#endif + +/* + * sparsemem vmemmap imposes an additional requirement on the alignment of + * memstart_addr, due to the fact that the base of the vmemmap region + * has a direct correspondence, and needs to appear sufficiently aligned + * in the virtual address space. + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS +#define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS) +#else +#define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT) #endif #endif /* __ASM_KERNEL_PGTABLE_H */ From 29089dcc2c5272437ce9bbdae4ea249ed25e8281 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 13 Apr 2016 16:01:22 +0100 Subject: [PATCH 0479/3220] UPSTREAM: arm64: Implement ptep_set_access_flags() for hardware AF/DBM When hardware updates of the access and dirty states are enabled, the default ptep_set_access_flags() implementation based on calling set_pte_at() directly is potentially racy. This triggers the "racy dirty state clearing" warning in set_pte_at() because an existing writable PTE is overridden with a clean entry. There are two main scenarios for this situation: 1. The CPU getting an access fault does not support hardware updates of the access/dirty flags. However, a different agent in the system (e.g. SMMU) can do this, therefore overriding a writable entry with a clean one could potentially lose the automatically updated dirty status 2. A more complex situation is possible when all CPUs support hardware AF/DBM: a) Initial state: shareable + writable vma and pte_none(pte) b) Read fault taken by two threads of the same process on different CPUs c) CPU0 takes the mmap_sem and proceeds to handling the fault. It eventually reaches do_set_pte() which sets a writable + clean pte. CPU0 releases the mmap_sem d) CPU1 acquires the mmap_sem and proceeds to handle_pte_fault(). The pte entry it reads is present, writable and clean and it continues to pte_mkyoung() e) CPU1 calls ptep_set_access_flags() If between (d) and (e) the hardware (another CPU) updates the dirty state (clears PTE_RDONLY), CPU1 will override the PTR_RDONLY bit marking the entry clean again. This patch implements an arm64-specific ptep_set_access_flags() function to perform an atomic update of the PTE flags. 
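
In outline, the problem with the generic implementation is that it
performs a plain store (a sketch; the real arm64 replacement is the asm
sequence in the diff below):

	/* generic ptep_set_access_flags() boils down to: */
	set_pte_at(vma->vm_mm, address, ptep, entry);
	/* ...a plain store that can overwrite a dirty state the hardware
	 * sets between the read and the write; the arm64 version instead
	 * updates *ptep in an ldxr/stxr loop, so a concurrent hardware
	 * clearing of PTE_RDONLY is never lost */
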
Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Signed-off-by: Catalin Marinas Reported-by: Ming Lei Tested-by: Julien Grall Cc: Will Deacon Cc: # 4.3+ [will: reworded comment] Signed-off-by: Will Deacon Change-Id: Id2a0b0d8eb6e7df6325ecb48b88b8401a5dd09e5 (cherry picked from commit 66dbd6e61a526ae7d11a208238ae2c17e5cacb6b) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/pgtable.h | 5 ++++ arch/arm64/mm/fault.c | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d48ea8fc789d..0c82a1d9f31e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -594,6 +594,11 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #ifdef CONFIG_ARM64_HW_AFDBM +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + /* * Atomic pte/pmd modifications. */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 44e56de23f79..9bb45de46085 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -81,6 +81,56 @@ void show_pte(struct mm_struct *mm, unsigned long addr) printk("\n"); } +#ifdef CONFIG_ARM64_HW_AFDBM +/* + * This function sets the access flags (dirty, accessed), as well as write + * permission, and only to a more permissive setting. + * + * It needs to cope with hardware update of the accessed/dirty state by other + * agents in the system and can safely skip the __sync_icache_dcache() call as, + * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * + * Returns whether or not the PTE actually changed. + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + pteval_t old_pteval; + unsigned int tmp; + + if (pte_same(*ptep, entry)) + return 0; + + /* only preserve the access flags and write permission */ + pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; + + /* + * PTE_RDONLY is cleared by default in the asm below, so set it in + * back if necessary (read-only or clean PTE). + */ + if (!pte_write(entry) || !dirty) + pte_val(entry) |= PTE_RDONLY; + + /* + * Setting the flags must be done atomically to avoid racing with the + * hardware update of the access/dirty state. + */ + asm volatile("// ptep_set_access_flags\n" + " prfm pstl1strm, %2\n" + "1: ldxr %0, %2\n" + " and %0, %0, %3 // clear PTE_RDONLY\n" + " orr %0, %0, %4 // set flags\n" + " stxr %w1, %0, %2\n" + " cbnz %w1, 1b\n" + : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) + : "L" (~PTE_RDONLY), "r" (pte_val(entry))); + + flush_tlb_fix_spurious_fault(vma, address); + return 1; +} +#endif + /* * The kernel tried to access some page that wasn't present. 
*/ From 2515432e66e01f632e49069e7c04b969dc13c3d5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:44 +0200 Subject: [PATCH 0480/3220] UPSTREAM: arm64: introduce mov_q macro to move a constant into a 64-bit register Implement a macro mov_q that can be used to move an immediate constant into a 64-bit register, using between 2 and 4 movz/movk instructions (depending on the operand) Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Change-Id: I7e6b684e46cad5df79e6b8bc28d72b9e37daedd6 (cherry picked from commit 30b5ba5cf333cc650e474eaf2cc1ae91bc7cf89f) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 70f7b9e04598..9ea846ded55c 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -233,4 +233,24 @@ lr .req x30 // link register .long \sym\()_hi32 .endm + /* + * mov_q - move an immediate constant into a 64-bit register using + * between 2 and 4 movz/movk instructions (depending on the + * magnitude and sign of the operand) + */ + .macro mov_q, reg, val + .if (((\val) >> 31) == 0 || ((\val) >> 31) == 0x1ffffffff) + movz \reg, :abs_g1_s:\val + .else + .if (((\val) >> 47) == 0 || ((\val) >> 47) == 0x1ffff) + movz \reg, :abs_g2_s:\val + .else + movz \reg, :abs_g3:\val + movk \reg, :abs_g2_nc:\val + .endif + movk \reg, :abs_g1_nc:\val + .endif + movk \reg, :abs_g0_nc:\val + .endm + #endif /* __ASM_ASSEMBLER_H */ From 9c8a3a98f3c33eb7665dd5f101f28c46fedb4b75 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:00 +0100 Subject: [PATCH 0481/3220] BACKPORT: arm64: Fold proc-macros.S into assembler.h To allow the assembler macros defined in arch/arm64/mm/proc-macros.S to be used outside the mm code move the contents of proc-macros.S to asm/assembler.h. Also, delete proc-macros.S, and fix up all references to proc-macros.S. Signed-off-by: Geoff Levand Acked-by: Pavel Machek [rebased, included dcache_by_line_op] Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon Change-Id: I09e694442ffd25dcac864216d0369c9727ad0090 (cherry picked from commit 7b7293ae3dbd0a1965bf310b77fed5f9bb37bb93) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 82 ++++++++++++++++++++++++- arch/arm64/mm/cache.S | 2 - arch/arm64/mm/proc-macros.S | 98 ------------------------------ arch/arm64/mm/proc.S | 3 - 4 files changed, 81 insertions(+), 104 deletions(-) delete mode 100644 arch/arm64/mm/proc-macros.S diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9ea846ded55c..33bbb7ba7fb5 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -1,5 +1,5 @@ /* - * Based on arch/arm/include/asm/assembler.h + * Based on arch/arm/include/asm/assembler.h, arch/arm/mm/proc-macros.S * * Copyright (C) 1996-2000 Russell King * Copyright (C) 2012 ARM Ltd. 
@@ -23,6 +23,8 @@ #ifndef __ASM_ASSEMBLER_H #define __ASM_ASSEMBLER_H +#include +#include #include #include @@ -211,6 +213,84 @@ lr .req x30 // link register add \reg, \reg, \tmp .endm +/* + * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) + */ + .macro vma_vm_mm, rd, rn + ldr \rd, [\rn, #VMA_VM_MM] + .endm + +/* + * mmid - get context id from mm pointer (mm->context.id) + */ + .macro mmid, rd, rn + ldr \rd, [\rn, #MM_CONTEXT_ID] + .endm + +/* + * dcache_line_size - get the minimum D-cache line size from the CTR register. + */ + .macro dcache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + ubfm \tmp, \tmp, #16, #19 // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * icache_line_size - get the minimum I-cache line size from the CTR register. + */ + .macro icache_line_size, reg, tmp + mrs \tmp, ctr_el0 // read CTR + and \tmp, \tmp, #0xf // cache line size encoding + mov \reg, #4 // bytes per word + lsl \reg, \reg, \tmp // actual cache line size + .endm + +/* + * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map + */ + .macro tcr_set_idmap_t0sz, valreg, tmpreg +#ifndef CONFIG_ARM64_VA_BITS_48 + ldr_l \tmpreg, idmap_t0sz + bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH +#endif + .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruciton + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 6df07069a025..50ff9ba3a236 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -24,8 +24,6 @@ #include #include -#include "proc-macros.S" - /* * flush_icache_range(start,end) * diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S deleted file mode 100644 index 984edcda1850..000000000000 --- a/arch/arm64/mm/proc-macros.S +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Based on arch/arm/mm/proc-macros.S - * - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ - -#include -#include - -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldr \rd, [\rn, #VMA_VM_MM] - .endm - -/* - * mmid - get context id from mm pointer (mm->context.id) - */ - .macro mmid, rd, rn - ldr \rd, [\rn, #MM_CONTEXT_ID] - .endm - -/* - * dcache_line_size - get the minimum D-cache line size from the CTR register. - */ - .macro dcache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - ubfm \tmp, \tmp, #16, #19 // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * icache_line_size - get the minimum I-cache line size from the CTR register. - */ - .macro icache_line_size, reg, tmp - mrs \tmp, ctr_el0 // read CTR - and \tmp, \tmp, #0xf // cache line size encoding - mov \reg, #4 // bytes per word - lsl \reg, \reg, \tmp // actual cache line size - .endm - -/* - * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map - */ - .macro tcr_set_idmap_t0sz, valreg, tmpreg -#ifndef CONFIG_ARM64_VA_BITS_48 - ldr_l \tmpreg, idmap_t0sz - bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH -#endif - .endm - -/* - * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present - */ - .macro reset_pmuserenr_el0, tmpreg - mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx \tmpreg, \tmpreg, #8, #4 - cmp \tmpreg, #1 // Skip if no PMU present - b.lt 9000f - msr pmuserenr_el0, xzr // Disable PMU access from EL0 -9000: - .endm - -/* - * Macro to perform a data cache maintenance for the interval - * [kaddr, kaddr + size) - * - * op: operation passed to dc instruction - * domain: domain used in dsb instruciton - * kaddr: starting virtual address of the region - * size: size of the region - * Corrupts: kaddr, size, tmp1, tmp2 - */ - .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 - dcache_line_size \tmp1, \tmp2 - add \size, \kaddr, \size - sub \tmp2, \tmp1, #1 - bic \kaddr, \kaddr, \tmp2 -9998: dc \op, \kaddr - add \kaddr, \kaddr, \tmp1 - cmp \kaddr, \size - b.lo 9998b - dsb \domain - .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 543f5198005a..9f6deacf41d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -23,13 +23,10 @@ #include #include #include -#include #include #include #include -#include "proc-macros.S" - #ifdef CONFIG_ARM64_64K_PAGES #define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K #elif defined(CONFIG_ARM64_16K_PAGES) From fdc4c4805c5afb51fdd0e03ee55281d9b10f25c1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 13 Jun 2016 11:15:14 +0100 Subject: [PATCH 0482/3220] UPSTREAM: arm64: fix dump_instr when PAN and UAO are in use If the kernel is set to show unhandled signals, and a user task does not handle a SIGILL as a result of an instruction abort, we will attempt to log the offending instruction with dump_instr before killing the task. We use dump_instr to log the encoding of the offending userspace instruction. However, dump_instr is also used to dump instructions from kernel space, and internally always switches to KERNEL_DS before dumping the instruction with get_user. When both PAN and UAO are in use, reading a user instruction via get_user while in KERNEL_DS will result in a permission fault, which leads to an Oops. As we have regs corresponding to the context of the original instruction abort, we can inspect this and only flip to KERNEL_DS if the original abort was taken from the kernel, avoiding this issue. 
At the same time, remove the redundant (and incorrect) comments regarding the order dump_mem and dump_instr are called in. Cc: Catalin Marinas Cc: James Morse Cc: Robin Murphy Cc: #4.6+ Signed-off-by: Mark Rutland Reported-by: Vladimir Murzin Tested-by: Vladimir Murzin Fixes: 57f4959bad0a154a ("arm64: kernel: Add support for User Access Override") Signed-off-by: Will Deacon Change-Id: I54c00f3598d227a7e2767b357cb453075dcce7bd (cherry picked from commit c5cea06be060f38e5400d796e61cfc8c36e52924) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/traps.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index c5392081b49b..58651a9dfcf8 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -64,8 +64,7 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, /* * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. + * to safely read from kernel space. */ fs = get_fs(); set_fs(KERNEL_DS); @@ -111,21 +110,12 @@ static void dump_backtrace_entry(unsigned long where) print_ip_sym(where); } -static void dump_instr(const char *lvl, struct pt_regs *regs) +static void __dump_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); - mm_segment_t fs; char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; - /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - for (i = -4; i < 1; i++) { unsigned int val, bad; @@ -139,8 +129,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) } } printk("%sCode: %s\n", lvl, str); +} - set_fs(fs); +static void dump_instr(const char *lvl, struct pt_regs *regs) +{ + if (!user_mode(regs)) { + mm_segment_t fs = get_fs(); + set_fs(KERNEL_DS); + __dump_instr(lvl, regs); + set_fs(fs); + } else { + __dump_instr(lvl, regs); + } } static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) From 916a99af4c1bf19207dd4589b1704ab144407c77 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 13 Jun 2016 17:57:02 +0100 Subject: [PATCH 0483/3220] UPSTREAM: arm64: mm: mark fault_info table const Unlike the debug_fault_info table, we never intentionally alter the fault_info table at runtime, and all derived pointers are treated as const currently. Make the table const so that it can be placed in .rodata and protected from unintentional writes, as we do for the syscall tables. 
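
The effect in miniature (a standalone sketch, not the kernel's actual
table):

	struct fault_info { int sig; const char *name; };

	/* 'const' plus static initialisation lets the compiler emit the
	 * table into .rodata, so a stray write through a dangling pointer
	 * faults instead of silently corrupting fault dispatch */
	static const struct fault_info fault_info[] = {
		{ 11, "page fault" },
	};
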
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon Change-Id: I3fb0bb55427835c165cc377d8dc2a3fa9e6e950d (cherry picked from commit bbb1681ee3653bdcfc6a4ba31902738118311fd4) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9bb45de46085..b5773944d1d3 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -438,7 +438,7 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 1; } -static struct fault_info { +static const struct fault_info { int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs); int sig; int code; From 32054524cb03d884215a6051ad8c21077c241250 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 12:33:01 +0100 Subject: [PATCH 0484/3220] UPSTREAM: arm64: add macro to extract ESR_ELx.EC Several places open-code extraction of the EC field from an ESR_ELx value, in subtly different ways. This is unfortunate duplication and variation, and the precise logic used to extract the field is a distraction. This patch adds a new macro, ESR_ELx_EC(), to extract the EC field from an ESR_ELx value in a consistent fashion. Existing open-coded extractions in core arm64 code are moved over to the new helper. KVM code is left as-is for the moment. Signed-off-by: Mark Rutland Tested-by: Huang Shijie Cc: Dave P Martin Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Catalin Marinas Change-Id: Ib634a4795277d243fce5dd30b139e2ec1465bee9 (cherry picked from commit 275f344bec51e9100bae81f3cc8c6940bbfb24c0) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/esr.h | 1 + arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/fault.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 77eeb2cc648f..f772e15c4766 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -74,6 +74,7 @@ #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) +#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define ESR_ELx_IL (UL(1) << 25) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 58651a9dfcf8..29d7b68f63f0 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -465,7 +465,7 @@ static const char *esr_class_str[] = { const char *esr_get_class_string(u32 esr) { - return esr_class_str[esr >> ESR_ELx_EC_SHIFT]; + return esr_class_str[ESR_ELx_EC(esr)]; } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index b5773944d1d3..59b8e1db6338 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -244,7 +244,7 @@ out: static inline int permission_fault(unsigned int esr) { - unsigned int ec = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT; + unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); From b8f39d900e1581dd99788c1e38989b93d114dced Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 31 May 2016 12:33:03 +0100 Subject: [PATCH 0485/3220] UPSTREAM: arm64: kill ESR_LNX_EXEC Currently we treat ESR_EL1 bit 24 as software-defined for distinguishing instruction aborts from data aborts, but this bit is architecturally RES0 for instruction aborts, and could be allocated for an arbitrary purpose in future. 
Additionally, we hard-code the value in entry.S without the mnemonic, making the code difficult to understand. Instead, remove ESR_LNX_EXEC, and distinguish aborts based on the esr, which we already pass to the sole use of ESR_LNX_EXEC. A new helper, is_el0_instruction_abort() is added to make the logic clear. Any instruction aborts taken from EL1 will already have been handled by bad_mode, so we need not handle that case in the helper. For consistency, the existing permission_fault helper is renamed to is_permission_fault, and the return type is changed to bool. There should be no functional changes as the return value was a boolean expression, and the result is only used in another boolean expression. Signed-off-by: Mark Rutland Cc: Dave P Martin Cc: Huang Shijie Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Catalin Marinas Change-Id: Iaf66fa5f3b13cf985b11a3b0a40c4333fe9ef833 (cherry picked from commit 541ec870ef31433018d245614254bd9d810a9ac3) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 2 +- arch/arm64/mm/fault.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 12e8d2bcb3f9..eefffa81c6df 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -532,7 +532,7 @@ el0_ia: enable_dbg_and_irq ct_user_exit mov x0, x26 - orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts + mov x1, x25 mov x2, sp bl do_mem_abort b ret_to_user diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 59b8e1db6338..56c6bfbd19b4 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -202,8 +202,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADMAP 0x010000 #define VM_FAULT_BADACCESS 0x020000 -#define ESR_LNX_EXEC (1 << 24) - static int __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int mm_flags, unsigned long vm_flags, struct task_struct *tsk) @@ -242,7 +240,7 @@ out: return fault; } -static inline int permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; @@ -250,6 +248,11 @@ static inline int permission_fault(unsigned int esr) return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } +static bool is_el0_instruction_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -276,14 +279,14 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; - if (esr & ESR_LNX_EXEC) { + if (is_el0_instruction_abort(esr)) { vm_flags = VM_EXEC; } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } - if (permission_fault(esr) && (addr < USER_DS)) { + if (is_permission_fault(esr) && (addr < USER_DS)) { if (get_fs() == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); From 9887915a4ecf3697d056f01f8e7fe09c853f93bb Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Wed, 27 Apr 2016 17:47:10 +0100 Subject: [PATCH 0486/3220] UPSTREAM: arm64: Add new asm macro copy_page Kexec and hibernate need to copy pages of memory, but may not have all of the kernel mapped, and are unable to call copy_page(). Add a simplistic copy_page() macro, that can be inlined in these situations. 
lib/copy_page.S provides a bigger better version, but uses more registers. Signed-off-by: Geoff Levand [Changed asm label to 9998, added commit message] Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Will Deacon Change-Id: If23a454e211b1f57f8ba1a2a00b44dabf4b82932 (cherry picked from commit 5003dbde45961dd7ab3d8a09ab9ad8bcb604db40) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 33bbb7ba7fb5..290e13428f4a 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -24,6 +24,7 @@ #define __ASM_ASSEMBLER_H #include +#include #include #include #include @@ -291,6 +292,24 @@ lr .req x30 // link register 9000: .endm +/* + * copy_page - copy src to dest using temp registers t1-t8 + */ + .macro copy_page dest:req src:req t1:req t2:req t3:req t4:req t5:req t6:req t7:req t8:req +9998: ldp \t1, \t2, [\src] + ldp \t3, \t4, [\src, #16] + ldp \t5, \t6, [\src, #32] + ldp \t7, \t8, [\src, #48] + add \src, \src, #64 + stnp \t1, \t2, [\dest] + stnp \t3, \t4, [\dest, #16] + stnp \t5, \t6, [\dest, #32] + stnp \t7, \t8, [\dest, #48] + add \dest, \dest, #64 + tst \src, #(PAGE_SIZE - 1) + b.ne 9998b + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. From 1d8922da229f3b2b4b6a9e75ad7dd75f862462bb Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 28 Jun 2016 18:07:27 +0100 Subject: [PATCH 0487/3220] UPSTREAM: Revert "arm64: alternatives: add enable parameter to conditional asm macros" Commit 77ee306c0aea9 ("arm64: alternatives: add enable parameter to conditional asm macros") extended the alternative assembly macros. Unfortunately this does not really work as one would expect, as the enable parameter in fact correctly protects the alternative section magic, but not the actual code sequences. This results in having both the original instruction(s) _and_ the alternative ones, if enable if false. Since there is no user of this macros anyway, just revert it. This reverts commit 77ee306c0aea9a219daec256ad25982944affef8. Signed-off-by: Andre Przywara Signed-off-by: Catalin Marinas Change-Id: I608104891335dfa2dacdb364754ae2658088ddf2 (cherry picked from commit b82bfa4793cd0f8fde49b85e0ad66906682e7447) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/alternative.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index beccbdefa106..7288071195e1 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -95,13 +95,11 @@ void apply_alternatives(void *start, size_t length); * The code that follows this macro will be assembled and linked as * normal. There are no restrictions on this code. */ -.macro alternative_if_not cap, enable = 1 - .if \enable +.macro alternative_if_not cap .pushsection .altinstructions, "a" altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f .popsection 661: - .endif .endm /* @@ -118,22 +116,18 @@ void apply_alternatives(void *start, size_t length); * alternative sequence it is defined in (branches into an * alternative sequence are not fixed up). */ -.macro alternative_else, enable = 1 - .if \enable +.macro alternative_else 662: .pushsection .altinstr_replacement, "ax" 663: - .endif .endm /* * Complete an alternative code sequence. 
  */
-.macro alternative_endif, enable = 1
-	.if \enable
+.macro alternative_endif
 664:	.popsection
 	.org	. - (664b-663b) + (662b-661b)
 	.org	. - (662b-661b) + (664b-663b)
-	.endif
 	.endm
 
 #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...)	\

From 405893f77e25abf7584f8ea65567c032849e41dd Mon Sep 17 00:00:00 2001
From: Andre Przywara
Date: Tue, 28 Jun 2016 18:07:28 +0100
Subject: [PATCH 0488/3220] UPSTREAM: arm64: fix "dc cvau" cache operation on
 errata-affected core

The ARM errata 819472, 826319, 827319 and 824069 for affected
Cortex-A53 cores require promoting "dc cvau" instructions to
"dc civac" as well.
Attribute the usage of the instruction in __flush_cache_user_range
to also be covered by our alternative patching efforts.
For that we introduce an assembly macro which deals with alternatives
while still tagging the instructions as USER.

Signed-off-by: Andre Przywara
Signed-off-by: Catalin Marinas
Change-Id: If5e7933ba32331b2aa28fc5d9e019649452f0f6c
(cherry picked from commit 290622efc76ece22ef76a30bf117755891ab27f6)
Signed-off-by: Sami Tolvanen
---
 arch/arm64/include/asm/alternative.h | 4 ++++
 arch/arm64/mm/cache.S                | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 7288071195e1..8746ff6abd77 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -133,6 +133,10 @@ void apply_alternatives(void *start, size_t length);
 #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...)	\
 	alternative_insn insn1, insn2, cap, IS_ENABLED(cfg)
 
+.macro user_alt, label, oldinstr, newinstr, cond
+9999:	alternative_insn "\oldinstr", "\newinstr", \cond
+	_ASM_EXTABLE 9999b, \label
+.endm
 
 /*
  * Generate the assembly for UAO alternatives with exception table entries.
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 50ff9ba3a236..07d7352d7c38 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -52,7 +52,7 @@ ENTRY(__flush_cache_user_range)
 	sub	x3, x2, #1
 	bic	x4, x0, x3
 1:
-USER(9f, dc	cvau, x4	)	// clean D line to PoU
+user_alt 9f, "dc cvau, x4", "dc civac, x4", ARM64_WORKAROUND_CLEAN_CACHE
 	add	x4, x4, x2
 	cmp	x4, x1
 	b.lo	1b

From 76440ecb4dfc5f76582199a32b6c33ee9f0aa221 Mon Sep 17 00:00:00 2001
From: Andre Przywara
Date: Tue, 28 Jun 2016 18:07:29 +0100
Subject: [PATCH 0489/3220] UPSTREAM: arm64: include alternative handling in
 dcache_by_line_op

The newly introduced dcache_by_line_op macro is used in at least one
place at the moment to issue a "dc cvau" instruction, which is affected
by ARM errata 819472, 826319, 827319 and 824069.
Change the macro to allow for alternative patching in there to protect
affected Cortex-A53 cores.
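
For a dcache_by_line_op user the net effect is (a sketch; the operand
registers are illustrative):

	dcache_by_line_op cvau, ish, x0, x1, x2, x3
	// unaffected cores execute the default:   dc cvau, x0
	// affected Cortex-A53s are patched to:    dc civac, x0
	// selected at boot when the ARM64_WORKAROUND_CLEAN_CACHE
	// alternative is applied
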
Signed-off-by: Andre Przywara [catalin.marinas@arm.com: indentation fixups] Signed-off-by: Catalin Marinas Change-Id: I450594dc311b09b6b832b707a9abb357608cc6e4 (cherry picked from commit 823066d9edcdfe4cedb06216c2b1f91efaf68a87) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 290e13428f4a..9e8ac1e73457 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -24,6 +24,7 @@ #define __ASM_ASSEMBLER_H #include +#include #include #include #include @@ -273,7 +274,16 @@ lr .req x30 // link register add \size, \kaddr, \size sub \tmp2, \tmp1, #1 bic \kaddr, \kaddr, \tmp2 -9998: dc \op, \kaddr +9998: + .if (\op == cvau || \op == cvac) +alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE + dc \op, \kaddr +alternative_else + dc civac, \kaddr +alternative_endif + .else + dc \op, \kaddr + .endif add \kaddr, \kaddr, \tmp1 cmp \kaddr, \size b.lo 9998b From 4caec81662608819e904586b172abc85945b4d65 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 20 Jun 2016 18:28:01 +0100 Subject: [PATCH 0490/3220] BACKPORT: arm64: kernel: Save and restore UAO and addr_limit on exception entry If we take an exception while at EL1, the exception handler inherits the original context's addr_limit and PSTATE.UAO values. To be consistent always reset addr_limit and PSTATE.UAO on (re-)entry to EL1. This prevents accidental re-use of the original context's addr_limit. Based on a similar patch for arm from Russell King. Cc: # 4.6- Acked-by: Will Deacon Reviewed-by: Mark Rutland Signed-off-by: James Morse Signed-off-by: Will Deacon Change-Id: Iab453201c6e08bc6e22500b7c5570dd0fe2d1b74 (cherry picked from commit e19a6ee2460bdd0d0055a6029383422773f9999a) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/ptrace.h | 2 ++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 19 +++++++++++++++++-- arch/arm64/mm/fault.c | 3 ++- 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index e9e5467e0bf4..200d5c32b38f 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -116,6 +116,8 @@ struct pt_regs { }; u64 orig_x0; u64 syscallno; + u64 orig_addr_limit; + u64 unused; // maintain 16 byte alignment }; #define arch_has_single_step() (1) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 25de8b244961..087cf9a65359 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -58,6 +58,7 @@ int main(void) DEFINE(S_PC, offsetof(struct pt_regs, pc)); DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); + DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index eefffa81c6df..9fabed5dd2bf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -97,7 +98,14 @@ mov x29, xzr // fp pointed to user-space .else add x21, sp, #S_FRAME_SIZE - .endif + get_thread_info tsk + /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ + ldr x20, [tsk, #TI_ADDR_LIMIT] + str x20, [sp, #S_ORIG_ADDR_LIMIT] 
+ mov x20, #TASK_SIZE_64 + str x20, [tsk, #TI_ADDR_LIMIT] + ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO) + .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] @@ -128,6 +136,14 @@ .endm .macro kernel_exit, el + .if \el != 0 + /* Restore the task's original addr_limit. */ + ldr x20, [sp, #S_ORIG_ADDR_LIMIT] + str x20, [tsk, #TI_ADDR_LIMIT] + + /* No need to restore UAO, it will be restored from SPSR_EL1 */ + .endif + ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter @@ -406,7 +422,6 @@ el1_irq: bl trace_hardirqs_off #endif - get_thread_info tsk irq_handler #ifdef CONFIG_PREEMPT diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 56c6bfbd19b4..03d44388ae7b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -287,7 +287,8 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (is_permission_fault(esr) && (addr < USER_DS)) { - if (get_fs() == KERNEL_DS) + /* regs->orig_addr_limit may be 0 if we entered from EL0 */ + if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); if (!search_exception_tables(regs->pc)) From d907eb252977469ed9bded38bd76c2212065c2cf Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 9 Aug 2016 18:25:26 -0700 Subject: [PATCH 0491/3220] UPSTREAM: arm64: Handle el1 synchronous instruction aborts cleanly Executing from a non-executable area gives an ugly message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0e08 lkdtm: attempting bad execution at ffff000008880700 Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL) CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13 Hardware name: linux,dummy-virt (DT) task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 The 'IABT (current EL)' indicates the error but it's a bit cryptic without knowledge of the ARM ARM. There is also no indication of the specific address which triggered the fault. The increase in kernel page permissions makes hitting this case more likely as well. 
Handling the case in the vectors gives a much more familiar looking error message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0840 lkdtm: attempting bad execution at ffff000008880680 Unable to handle kernel paging request at virtual address ffff000008880680 pgd = ffff8000089b2000 [ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000 Internal error: Oops: 8400000e [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24 Hardware name: linux,dummy-virt (DT) task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 Acked-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Catalin Marinas Change-Id: Ifba74589ba2cf05b28335d4fd3e3140ef73668db (cherry picked from commit 9adeb8e72dbfe976709df01e259ed556ee60e779) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/entry.S | 7 +++++++ arch/arm64/mm/fault.c | 14 ++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9fabed5dd2bf..533e1c9fd5a6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -352,6 +352,8 @@ el1_sync: lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1 b.eq el1_da + cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1 + b.eq el1_ia cmp x24, #ESR_ELx_EC_SYS64 // configurable trap b.eq el1_undef cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception @@ -363,6 +365,11 @@ el1_sync: cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1 b.ge el1_dbg b el1_inv + +el1_ia: + /* + * Fall through to the Data abort case + */ el1_da: /* * Data abort handling diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 03d44388ae7b..e328430dbcec 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -131,6 +131,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma, } #endif +static bool is_el1_instruction_abort(unsigned int esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; +} + /* * The kernel tried to access some page that wasn't present. */ @@ -139,8 +144,9 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, { /* * Are we prepared to handle this kernel fault? + * We are almost certainly not prepared to handle instruction faults. 
*/ - if (fixup_exception(regs)) + if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) return; /* @@ -245,7 +251,8 @@ static inline bool is_permission_fault(unsigned int esr) unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || + (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } static bool is_el0_instruction_abort(unsigned int esr) @@ -291,6 +298,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); + if (is_el1_instruction_abort(esr)) + die("Attempting to execute userspace memory", regs, esr); + if (!search_exception_tables(regs->pc)) die("Accessing user space memory outside uaccess.h routines", regs, esr); } From 23368b642deb01ac6ce668ec1dedfcc0cab25c71 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 14:58:21 +0100 Subject: [PATCH 0492/3220] FROMLIST: arm64: Factor out PAN enabling/disabling into separate uaccess_* macros This patch moves the directly coded alternatives for turning PAN on/off into separate uaccess_{enable,disable} macros or functions. The asm macros take a few arguments which will be used in subsequent patches. Note that any (unlikely) access that the compiler might generate between uaccess_enable() and uaccess_disable(), other than those explicitly specified by the user access code, will not be protected by PAN. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Ic3fddd706400c8798f57456c56361d84d234f6ef (cherry picked from commit a4820644c627b82cbc865f2425bb788c94743b16) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/futex.h | 14 ++--- arch/arm64/include/asm/uaccess.h | 79 +++++++++++++++++++++++++--- arch/arm64/kernel/armv8_deprecated.c | 10 ++-- arch/arm64/lib/clear_user.S | 8 ++- arch/arm64/lib/copy_from_user.S | 8 ++- arch/arm64/lib/copy_in_user.S | 8 ++- arch/arm64/lib/copy_to_user.S | 8 ++- 7 files changed, 95 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f2585cdd32c2..71dfa3b42313 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -27,9 +27,9 @@ #include #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ +do { \ + uaccess_enable(); \ asm volatile( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +44,11 @@ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +118,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" -ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +134,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " .popsection\n" _ASM_EXTABLE(1b, 
4b) _ASM_EXTABLE(2b, 4b) -ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); + uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c3d445b42351..c8ef22a9a83b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,6 +18,8 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H +#ifndef __ASSEMBLY__ + /* * User space memory access functions */ @@ -123,6 +125,44 @@ static inline void set_fs(mm_segment_t fs) " .long (" #from " - .), (" #to " - .)\n" \ " .popsection\n" +/* + * User access enabling/disabling. + */ +#define __uaccess_disable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +#define __uaccess_enable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +static inline void uaccess_disable(void) +{ + __uaccess_disable(ARM64_HAS_PAN); +} + +static inline void uaccess_enable(void) +{ + __uaccess_enable(ARM64_HAS_PAN); +} + +/* + * These functions are no-ops when UAO is present. + */ +static inline void uaccess_disable_not_uao(void) +{ + __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); +} + +static inline void uaccess_enable_not_uao(void) +{ + __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); +} + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -150,8 +190,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -172,9 +211,8 @@ do { \ default: \ BUILD_BUG(); \ } \ + uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -219,8 +257,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -241,8 +278,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_disable_not_uao(); \ } while (0) #define __put_user(x, ptr) \ @@ -327,4 +363,31 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +#else /* __ASSEMBLY__ */ + +#include +#include + +/* + * User access enabling/disabling macros. These are no-ops when UAO is + * present. 
+ */ + .macro uaccess_disable_not_uao, tmp1 +alternative_if_not ARM64_ALT_PAN_NOT_UAO + nop +alternative_else + SET_PSTATE_PAN(1) +alternative_endif + .endm + + .macro uaccess_enable_not_uao, tmp1, tmp2 +alternative_if_not ARM64_ALT_PAN_NOT_UAO + nop +alternative_else + SET_PSTATE_PAN(0) +alternative_endif + .endm + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c37202c0c838..562f7ddb158b 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -281,9 +281,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) * Error-checking SWP macros implemented using ldxr{b}/stxr{b} */ #define __user_swpX_asm(data, addr, res, temp, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%3]\n" \ "1: stxr"B" %w0, %w1, [%3]\n" \ " cbz %w0, 2f\n" \ @@ -299,11 +299,11 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp) \ __user_swpX_asm(data, addr, res, temp, "") diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 5d1cad3ce6d6..08b5f18ba604 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,10 @@ */ #include -#include #include #include #include +#include .text @@ -33,8 +33,7 @@ * Alignment fixed up by hardware. 
*/ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -54,8 +53,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x2 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 0b90497d4424..6505ec81f1da 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,11 @@ #include -#include #include #include #include #include +#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,12 +67,10 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index f7292dd08c84..9b04ff3ab610 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,11 @@ #include -#include #include #include #include #include +#include /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,12 +68,10 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7a7efe255034..8077e4f34d56 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,11 @@ #include -#include #include #include #include #include +#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,12 +66,10 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__arch_copy_to_user) From 3b66929169de053042d47e482dd5748794756153 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 15:48:55 +0100 Subject: [PATCH 0493/3220] FROMLIST: arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro This patch takes the errata workaround code out of cpu_do_switch_mm into a dedicated post_ttbr0_update_workaround macro which will be reused in a subsequent patch. 
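For orientation, the sequence the new macro encapsulates looks as follows as a C-level sketch. This is illustrative only: the real implementation is the assembly alternative in the diff below, and cpus_have_cap() stands in here for the boot-time alternatives patching.

/*
 * Sketch of post_ttbr0_update_workaround; names follow the diff,
 * but this function is not part of the patch.
 */
static inline void post_ttbr0_update_workaround_sketch(void)
{
	if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456) &&
	    cpus_have_cap(ARM64_WORKAROUND_CAVIUM_27456))
		asm volatile(
		"	ic	iallu\n"	/* invalidate the I-cache */
		"	dsb	nsh\n"		/* wait for the invalidation */
		"	isb\n"			/* resynchronize the pipeline */
		::: "memory");
}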
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I69f94e4c41046bd52ca9340b72d97bfcf955b586 (cherry picked from commit 4398e6a1644373a4c2f535f4153c8378d0914630) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 17 +++++++++++++++++ arch/arm64/mm/proc.S | 11 +---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9e8ac1e73457..9d3e77a5cf07 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -362,4 +362,21 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Errata workaround post TTBR0_EL1 update. + */ + .macro post_ttbr0_update_workaround +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb +alternative_endif +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9f6deacf41d2..765713702625 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -136,17 +136,8 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 + post_ttbr0_update_workaround ret - nop - nop - nop -alternative_else - ic iallu - dsb nsh - isb - ret -alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From 1911d36b27ba58ee18592df25b7ee636d4d4c41d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 16:53:00 +0100 Subject: [PATCH 0494/3220] FROMLIST: arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1 This patch adds the uaccess macros/functions to disable access to user space by setting TTBR0_EL1 to a reserved zeroed page. Since the value written to TTBR0_EL1 must be a physical address, for simplicity this patch introduces a reserved_ttbr0 page at a constant offset from swapper_pg_dir. The uaccess_disable code uses the ttbr1_el1 value adjusted by the reserved_ttbr0 offset. Enabling access to user is done by restoring TTBR0_EL1 with the value from the struct thread_info ttbr0 variable. Interrupts must be disabled during the uaccess_ttbr0_enable code to ensure the atomicity of the thread_info.ttbr0 read and TTBR0_EL1 write. This patch also moves the get_thread_info asm macro from entry.S to assembler.h for reuse in the uaccess_ttbr0_* macros. 
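The interrupt-masking requirement above is easiest to see from a deliberately racy C sketch (hypothetical code, written only to show the hazard; the patch's uaccess_ttbr0_enable() masks IRQs around exactly these two steps):

/* Hypothetical, racy variant -- NOT what the patch implements. */
static inline void uaccess_ttbr0_enable_racy(void)
{
	u64 ttbr = current_thread_info()->ttbr0;

	/*
	 * A context switch here could trigger an ASID roll-over and
	 * rewrite thread_info.ttbr0; the write below would then
	 * install a stale pgd/ASID pair in TTBR0_EL1.
	 */
	write_sysreg(ttbr, ttbr0_el1);
	isb();
}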
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Idf09a870b8612dce23215bce90d88781f0c0c3aa (cherry picked from commit 940d37234182d2675ab8ab46084840212d735018) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 16 +++++ arch/arm64/include/asm/cpufeature.h | 6 ++ arch/arm64/include/asm/kernel-pgtable.h | 7 ++ arch/arm64/include/asm/thread_info.h | 3 + arch/arm64/include/asm/uaccess.h | 96 +++++++++++++++++++++++-- arch/arm64/kernel/asm-offsets.c | 3 + arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 4 -- arch/arm64/kernel/head.S | 6 +- arch/arm64/kernel/vmlinux.lds.S | 5 ++ 10 files changed, 134 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9d3e77a5cf07..aeb4554b3af3 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -53,6 +53,15 @@ msr daifclr, #2 .endm + .macro save_and_disable_irq, flags + mrs \flags, daif + msr daifset, #2 + .endm + + .macro restore_irq, flags + msr daif, \flags + .endm + /* * Enable and disable debug exceptions. */ @@ -362,6 +371,13 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Return the current thread_info. + */ + .macro get_thread_info, rd + mrs \rd, sp_el0 + .endm + /* * Errata workaround post TTBR0_EL1 update. */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 727e594ac5c2..f125c03ab2e1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -188,6 +188,12 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } +static inline bool system_uses_ttbr0_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && + !cpus_have_cap(ARM64_HAS_PAN); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7e51d1b57c0c..7803343e5881 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include #include /* @@ -54,6 +55,12 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +#define RESERVED_TTBR0_SIZE (PAGE_SIZE) +#else +#define RESERVED_TTBR0_SIZE (0) +#endif + /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..b3325a9cb90f 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -47,6 +47,9 @@ typedef unsigned long mm_segment_t; struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ +#endif struct task_struct *task; /* main task structure */ int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c8ef22a9a83b..c37c064d7cdd 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -128,16 +129,57 @@ static inline void set_fs(mm_segment_t fs) /* * User 
access enabling/disabling. */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void uaccess_ttbr0_disable(void) +{ + unsigned long ttbr; + + /* reserved_ttbr0 placed at the end of swapper_pg_dir */ + ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; + write_sysreg(ttbr, ttbr0_el1); + isb(); +} + +static inline void uaccess_ttbr0_enable(void) +{ + unsigned long flags; + + /* + * Disable interrupts to avoid preemption between reading the 'ttbr0' + * variable and the MSR. A context switch could trigger an ASID + * roll-over and an update of 'ttbr0'. + */ + local_irq_save(flags); + write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + isb(); + local_irq_restore(flags); +} +#else +static inline void uaccess_ttbr0_disable(void) +{ +} + +static inline void uaccess_ttbr0_enable(void) +{ +} +#endif + #define __uaccess_disable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ + if (system_uses_ttbr0_pan()) \ + uaccess_ttbr0_disable(); \ + else \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) #define __uaccess_enable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ + if (system_uses_ttbr0_pan()) \ + uaccess_ttbr0_enable(); \ + else \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) static inline void uaccess_disable(void) @@ -367,12 +409,39 @@ extern __must_check long strnlen_user(const char __user *str, long n); #include #include +#include /* - * User access enabling/disabling macros. These are no-ops when UAO is - * present. + * User access enabling/disabling macros. + */ + .macro uaccess_ttbr0_disable, tmp1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + isb + .endm + + .macro uaccess_ttbr0_enable, tmp1 + get_thread_info \tmp1 + ldr \tmp1, [\tmp1, #TI_TTBR0] // load saved TTBR0_EL1 + msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 + isb + .endm + +/* + * These macros are no-ops when UAO is present. 
*/ .macro uaccess_disable_not_uao, tmp1 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +alternative_if_not ARM64_HAS_PAN + uaccess_ttbr0_disable \tmp1 +alternative_else + nop + nop + nop + nop +alternative_endif +#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else @@ -381,6 +450,21 @@ alternative_endif .endm .macro uaccess_enable_not_uao, tmp1, tmp2 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption + uaccess_ttbr0_enable \tmp1 + restore_irq \tmp2 +alternative_else + nop + nop + nop + nop + nop + nop + nop +alternative_endif +#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 087cf9a65359..d0ec987dba5b 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,6 +36,9 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TI_TTBR0, offsetof(struct thread_info, ttbr0)); +#endif DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); BLANK(); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7566cad9fa1d..40ee3f2933e7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -43,6 +43,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); #define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 533e1c9fd5a6..8eb8eb085036 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -190,10 +190,6 @@ alternative_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 54fd1e2590a2..9bfa58fea8ce 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -321,14 +321,14 @@ __create_page_tables: * dirty cache lines being evicted. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ mov x0, x25 - add x6, x26, #SWAPPER_DIR_SIZE + add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -406,7 +406,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 14813a6ca13b..87fd0556a1bd 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -185,6 +185,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . 
+= RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG From 5775ca34829caf0664c8ccc02fd0e93cb6022e0f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 2 Sep 2016 14:54:03 +0100 Subject: [PATCH 0495/3220] FROMLIST: arm64: Disable TTBR0_EL1 during normal kernel execution When the TTBR0 PAN feature is enabled, the kernel entry points need to disable access to TTBR0_EL1. The PAN status of the interrupted context is stored as part of the saved pstate, reusing the PSR_PAN_BIT (22). Restoring access to TTBR0_PAN is done on exception return if returning to user or returning to a context where PAN was disabled. Context switching via switch_mm() must defer the update of TTBR0_EL1 until a return to user or an explicit uaccess_enable() call. Special care needs to be taken for two cases where TTBR0_EL1 is set outside the normal kernel context switch operation: EFI run-time services (via efi_set_pgd) and CPU suspend (via cpu_(un)install_idmap). Code has been added to avoid deferred TTBR0_EL1 switching as in switch_mm() and restore the reserved TTBR0_EL1 when uninstalling the special TTBR0_EL1. This patch also removes a stale comment on the switch_mm() function. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Id1198cf1cde022fad10a94f95d698fae91d742aa (cherry picked from commit d26cfd64c973b31f73091c882e07350e14fdd6c9) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 26 ++++++++++- arch/arm64/include/asm/mmu_context.h | 51 +++++++++++++++------ arch/arm64/include/asm/ptrace.h | 2 + arch/arm64/kernel/entry.S | 67 ++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 9 ++++ arch/arm64/mm/context.c | 7 ++- 6 files changed, 146 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cb..932f5a56d1a6 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include #include #include #include @@ -69,7 +70,30 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); static inline void efi_set_pgd(struct mm_struct *mm) { - switch_mm(NULL, mm, NULL); + __switch_mm(mm); + + if (system_uses_ttbr0_pan()) { + if (mm != current->active_mm) { + /* + * Update the current thread's saved ttbr0 since it is + * restored as part of a return from exception. Set + * the hardware TTBR0_EL1 using cpu_switch_mm() + * directly to enable potential errata workarounds. + */ + update_saved_ttbr0(current, mm); + cpu_switch_mm(mm->pgd, mm); + } else { + /* + * Defer the switch to the current thread's TTBR0_EL1 + * until uaccess_enable(). Restore the current + * thread's saved ttbr0 corresponding to its active_mm + * (if different from init_mm). 
+ */ + cpu_set_reserved_ttbr0(); + if (current->active_mm != &init_mm) + update_saved_ttbr0(current, current->active_mm); + } + } } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a00f7cf35bbd..4a32fd5f101d 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -113,7 +114,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm) + if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } @@ -173,21 +174,27 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. - */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ + if (system_uses_ttbr0_pan()) { + BUG_ON(mm->pgd == swapper_pg_dir); + task_thread_info(tsk)->ttbr0 = + virt_to_phys(mm->pgd) | ASID(mm) << 48; + } +} +#else +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ +} +#endif + +static inline void __switch_mm(struct mm_struct *next) { unsigned int cpu = smp_processor_id(); - if (prev == next) - return; - /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. @@ -200,7 +207,23 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, check_and_switch_context(next, cpu); } +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + if (prev != next) + __switch_mm(next); + + /* + * Update the saved TTBR0_EL1 of the scheduled-in task as the previous + * value may have not been initialised yet (activate_mm caller) or the + * ASID has changed since the last run (following the context switch + * of another thread of the same process). + */ + update_saved_ttbr0(tsk, next); +} + #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, NULL) +#define activate_mm(prev,next) switch_mm(prev, next, current) #endif diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 200d5c32b38f..dd4257e286b1 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -21,6 +21,8 @@ #include +#define _PSR_PAN_BIT 22 + /* Current Exception Level values, as contained in CurrentEL */ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8eb8eb085036..eb185c93dfc6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include /* @@ -109,6 +111,34 @@ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. 
+ */ +alternative_if_not ARM64_HAS_PAN + nop +alternative_else + b 1f // skip TTBR0 PAN +alternative_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -147,6 +177,42 @@ ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if_not ARM64_HAS_PAN + nop +alternative_else + b 2f // skip TTBR0 PAN +alternative_endif + + .if \el != 0 + tbnz x22, #_PSR_PAN_BIT, 1f // Skip re-enabling TTBR0 access if previously disabled + .endif + + uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). + */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -168,6 +234,7 @@ alternative_else alternative_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 29b8c247d56f..6591bf23422b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -347,6 +347,15 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure init_thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. + */ + init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 7275628ba59f..25128089c386 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -182,7 +182,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpu_switch_mm(mm->pgd, mm); + /* + * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when + * emulating PAN. + */ + if (!system_uses_ttbr0_pan()) + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) From 5dc2b7c7bb33138270ff9494be6cf334bd3d20e1 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:22:39 +0100 Subject: [PATCH 0496/3220] FROMLIST: arm64: Handle faults caused by inadvertent user access with PAN enabled When TTBR0_EL1 is set to the reserved page, an erroneous kernel access to user space would generate a translation fault. This patch adds the checks for the software-set PSR_PAN_BIT to emulate a permission fault and report it accordingly. This patch also updates the description of the synchronous external aborts on translation table walks. 
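Condensed from the fault.c hunk below, the new classification amounts to the following predicate (sketch; the helper name is hypothetical, the identifiers are from the diff):

/*
 * With SW PAN, an inadvertent kernel access through the reserved
 * TTBR0 page faults at translation rather than at permission level,
 * so a translation fault taken with the emulated PAN bit set is
 * reported as a permission fault.
 */
static bool sw_pan_permission_fault(unsigned int fsc_type,
				    struct pt_regs *regs)
{
	return fsc_type == ESR_ELx_FSC_FAULT &&	/* translation fault */
	       (regs->pstate & PSR_PAN_BIT);	/* emulated PAN was set */
}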
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I623113fc8bf6d5f023aeec7a0640b62a25ef8420 (cherry picked from commit be3db9340c8011d22f06715339b66bcbbd4893bd) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index e328430dbcec..f0c75fd6a3fa 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -246,13 +246,19 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || - (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + else + return fsc_type == ESR_ELx_FSC_PERM; } static bool is_el0_instruction_abort(unsigned int esr) @@ -293,7 +299,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (is_permission_fault(esr) && (addr < USER_DS)) { + if (addr < USER_DS && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -478,10 +484,10 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, { do_bad, SIGBUS, 0, "synchronous parity error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, From 4dbc88bd2b6a74fd33483ee2593dcf2bd858eabe Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 5 Jul 2016 12:25:15 +0100 Subject: [PATCH 0497/3220] FROMLIST: arm64: xen: Enable user access before a privcmd hvc call Privcmd calls are issued by the userspace. The kernel needs to enable access to TTBR0_EL1 as the hypervisor would issue stage 1 translations to user memory via AT instructions. Since AT instructions are not affected by the PAN bit (ARMv8.1), we only need the explicit uaccess_enable/disable if the TTBR0 PAN option is enabled. 
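As C-style pseudocode, the change below has this shape (do_xen_hvc() is a hypothetical stand-in for the 'hvc XEN_IMM' instruction; the real change stays in assembly):

/* Sketch of the guarded hypercall; illustrative only. */
static long privcmd_call_sketch(unsigned long op, unsigned long arg)
{
	long ret;

	uaccess_enable_not_uao();	/* let the hypervisor's stage 1 AT walks resolve user VAs */
	ret = do_xen_hvc(op, arg);	/* hypothetical wrapper for 'hvc XEN_IMM' */
	uaccess_disable_not_uao();	/* drop user access before returning */

	return ret;
}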
Reviewed-by: Julien Grall Acked-by: Stefano Stabellini Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: I927f14076ba94c83e609b19f46dd373287e11fc4 (cherry picked from commit 8cc1f33d2c9f206b6505bedba41aed2b33c203c0) Signed-off-by: Sami Tolvanen --- arch/arm64/xen/hypercall.S | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 8bbe9401f4f0..6d6e4af1a4bf 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,6 +49,7 @@ #include #include +#include #include @@ -89,6 +90,24 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Privcmd calls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN are disabled). + */ + uaccess_enable_not_uao x6, x7 +#endif hvc XEN_IMM + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Disable userspace access from kernel once the hyp call completed. + */ + uaccess_disable_not_uao x6 +#endif ret ENDPROC(privcmd_call); From 67cd3bda54dadba4f8892105adf9c2f3982bfa0a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:25:31 +0100 Subject: [PATCH 0498/3220] FROMLIST: arm64: Enable CONFIG_ARM64_SW_TTBR0_PAN This patch adds the Kconfig option to enable support for TTBR0 PAN emulation. The option is default off because of a slight performance hit when enabled, caused by the additional TTBR0_EL1 switching during user access operations or exception entry/exit code. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Change-Id: Id00a8ad4169d6eb6176c468d953436eb4ae887ae (cherry picked from commit 6a2d7bad43474c48b68394d455b84a16b7d7dc3f) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bf24d67a1ff1..488fed2d5e5e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,6 +706,14 @@ config SETEND_EMULATION If unsure, say Y endif +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM From d625b6286ac7af9bc19811c1a625867a71c631bf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 12 Jan 2016 14:22:45 +0100 Subject: [PATCH 0499/3220] UPSTREAM: ia64: split off early_ioremap() declarations into asm/early_ioremap.h Unlike x86, arm64 and ARM, ia64 does not declare its implementations of early_ioremap/early_iounmap/early_memremap/early_memunmap in a header file called asm/early_ioremap.h. This complicates the use of these functions in generic code, since the header cannot be included directly, and we have to rely on transitive includes, which is fragile. So create an asm/early_ioremap.h for ia64, and move the existing definitions into it.
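The kind of generic early-boot code this split unblocks looks like the following sketch (probe_fw_table() is hypothetical):

#include <asm/early_ioremap.h>	/* now safe to include directly on ia64 too */

static int __init probe_fw_table(phys_addr_t pa, size_t len)
{
	void *tbl = early_memremap(pa, len);

	if (!tbl)
		return -ENOMEM;
	/* ... parse the firmware table at tbl ... */
	early_memunmap(tbl, len);
	return 0;
}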
Signed-off-by: Ard Biesheuvel Signed-off-by: Tony Luck Change-Id: I31eb55a9d57596faa40aec64bd26ce3ec21b0b4d (cherry picked from commit 809267708557ed5575831282f719ca644698084b) Signed-off-by: Sami Tolvanen --- arch/ia64/include/asm/early_ioremap.h | 10 ++++++++++ arch/ia64/include/asm/io.h | 5 +---- 2 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 arch/ia64/include/asm/early_ioremap.h diff --git a/arch/ia64/include/asm/early_ioremap.h b/arch/ia64/include/asm/early_ioremap.h new file mode 100644 index 000000000000..eec9e1d1b833 --- /dev/null +++ b/arch/ia64/include/asm/early_ioremap.h @@ -0,0 +1,10 @@ +#ifndef _ASM_IA64_EARLY_IOREMAP_H +#define _ASM_IA64_EARLY_IOREMAP_H + +extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); +#define early_memremap(phys_addr, size) early_ioremap(phys_addr, size) + +extern void early_iounmap (volatile void __iomem *addr, unsigned long size); +#define early_memunmap(addr, size) early_iounmap(addr, size) + +#endif diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 9041bbe2b7b4..a865d2a04f75 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -20,6 +20,7 @@ */ #include +#include /* We don't use IO slowdowns on the ia64, but.. */ #define __SLOW_DOWN_IO do { } while (0) @@ -427,10 +428,6 @@ __writeq (unsigned long val, volatile void __iomem *addr) extern void __iomem * ioremap(unsigned long offset, unsigned long size); extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); extern void iounmap (volatile void __iomem *addr); -extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); -#define early_memremap(phys_addr, size) early_ioremap(phys_addr, size) -extern void early_iounmap (volatile void __iomem *addr, unsigned long size); -#define early_memunmap(addr, size) early_iounmap(addr, size) static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size) { return ioremap(phys_addr, size); From a92cff09551d9ef0605af1cf6a7d9ee087d036c3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 12 Jan 2016 14:22:46 +0100 Subject: [PATCH 0500/3220] UPSTREAM: efi: include asm/early_ioremap.h not asm/efi.h to get early_memremap The code in efi.c uses early_memremap(), but relies on a transitive include rather than including asm/early_ioremap.h directly, since this header did not exist on ia64. Commit f7d924894265 ("arm64/efi: refactor EFI init and runtime code for reuse by 32-bit ARM") attempted to work around this by including asm/efi.h, which transitively includes asm/early_ioremap.h on most architectures. However, since asm/efi.h does not exist on ia64 either, this is not much of an improvement. Now that we have created an asm/early_ioremap.h for ia64, we can just include it directly. 
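The substantive change is a one-line include swap in drivers/firmware/efi/efi.c. Since the diff below loses the header names, they are restored here from the subject line:

-#include <asm/efi.h>
+#include <asm/early_ioremap.h>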
Reported-by: Guenter Roeck Signed-off-by: Ard Biesheuvel Signed-off-by: Tony Luck Change-Id: Ifa3e69e0b4078bac1e1d29bfe56861eb394e865b (cherry picked from commit 0f7f2f0c0fcbe5e2bcba707a628ebaedfe2be4b4) Signed-off-by: Sami Tolvanen --- drivers/firmware/efi/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index cffa89b3317b..2cd37dad67a6 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -25,7 +25,7 @@ #include #include -#include +#include struct efi __read_mostly efi = { .mps = EFI_INVALID_TABLE_ADDR, From 8fd31d5a2c06726a6060ea355bb5c90d44b132c4 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Mon, 3 Oct 2016 16:17:34 -0700 Subject: [PATCH 0501/3220] Fix a build breakage in IO latency hist code. Fix a build breakage where MMC is enabled, but BLOCK is not. Change-Id: I0eb422d12264f0371f3368ae7c37342ef9efabaa Signed-off-by: Mohan Srinivasan --- drivers/mmc/core/core.c | 6 ++++++ drivers/mmc/core/host.c | 4 ++++ include/linux/mmc/core.h | 2 ++ 3 files changed, 12 insertions(+) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b503249950df..f9d928cbc214 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -183,6 +183,7 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) pr_debug("%s: %d bytes transferred: %d\n", mmc_hostname(host), mrq->data->bytes_xfered, mrq->data->error); +#ifdef CONFIG_BLOCK if (mrq->lat_hist_enabled) { ktime_t completion; u_int64_t delta_us; @@ -194,6 +195,7 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) (mrq->data->flags & MMC_DATA_READ), delta_us); } +#endif trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data); } @@ -638,11 +640,13 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host, } if (!err && areq) { +#ifdef CONFIG_BLOCK if (host->latency_hist_enabled) { areq->mrq->io_start = ktime_get(); areq->mrq->lat_hist_enabled = 1; } else areq->mrq->lat_hist_enabled = 0; +#endif trace_mmc_blk_rw_start(areq->mrq->cmd->opcode, areq->mrq->cmd->arg, areq->mrq->data); @@ -2923,6 +2927,7 @@ static void __exit mmc_exit(void) destroy_workqueue(workqueue); } +#ifdef CONFIG_BLOCK static ssize_t latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2970,6 +2975,7 @@ mmc_latency_hist_sysfs_exit(struct mmc_host *host) { device_remove_file(&host->class_dev, &dev_attr_latency_hist); } +#endif subsys_initcall(mmc_init); module_exit(mmc_exit); diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 17068839c74b..443fdfc22d8a 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -392,7 +392,9 @@ int mmc_add_host(struct mmc_host *host) mmc_add_host_debugfs(host); #endif +#ifdef CONFIG_BLOCK mmc_latency_hist_sysfs_init(host); +#endif mmc_start_host(host); if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY)) @@ -422,7 +424,9 @@ void mmc_remove_host(struct mmc_host *host) mmc_remove_host_debugfs(host); #endif +#ifdef CONFIG_BLOCK mmc_latency_hist_sysfs_exit(host); +#endif device_del(&host->class_dev); diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 3349f0676acb..0860efd6e1be 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -137,7 +137,9 @@ struct mmc_request { void (*done)(struct mmc_request *);/* completion function */ struct mmc_host *host; ktime_t io_start; +#ifdef CONFIG_BLOCK int lat_hist_enabled; +#endif }; struct mmc_card; From 1a6c07cf6f2671e1a8a561745bdcc295a7cfeb17 Mon Sep 17 00:00:00 
2001 From: Jeremy Linton Date: Fri, 19 Feb 2016 11:50:32 -0600 Subject: [PATCH 0502/3220] BACKPORT: arm64: mm: Mark .rodata as RO Currently the .rodata section is actually still executable when DEBUG_RODATA is enabled. This changes that so the .rodata is actually read only, no execute. It also adds the .rodata section to the mem_init banner. Signed-off-by: Jeremy Linton Reviewed-by: Kees Cook Acked-by: Mark Rutland [catalin.marinas@arm.com: added vm_struct vmlinux_rodata in map_kernel()] Signed-off-by: Catalin Marinas Bug: 31374226 Change-Id: I6fd95beaf814fc91805da12c5329a57ce9008fd7 (cherry picked from commit 2f39b5f91eb4bccd786d194e70db1dccad784755) Signed-off-by: Sami Tolvanen --- arch/arm64/kernel/vmlinux.lds.S | 6 +++--- arch/arm64/mm/init.c | 2 ++ arch/arm64/mm/mmu.c | 25 ++++++++++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 87fd0556a1bd..f1d6c49dcc5f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -114,14 +114,14 @@ SECTIONS *(.got) /* Global offset table */ } + ALIGN_DEBUG_RO_MIN(PAGE_SIZE) _etext = .; /* End of text section */ - RO_DATA(PAGE_SIZE) - EXCEPTION_TABLE(8) + RO_DATA(PAGE_SIZE) /* everything from this point to */ + EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ NOTES ALIGN_DEBUG_RO_MIN(PAGE_SIZE) - _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0db12e9e8765..678878077996 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -376,6 +376,7 @@ void __init mem_init(void) " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n" " .data : 0x%p" " - 0x%p" " (%6ld KB)\n" #ifdef CONFIG_SPARSEMEM_VMEMMAP " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" @@ -391,6 +392,7 @@ void __init mem_init(void) MLG(VMALLOC_START, VMALLOC_END), MLK_ROUNDUP(__init_begin, __init_end), MLK_ROUNDUP(_text, _etext), + MLK_ROUNDUP(__start_rodata, __init_begin), MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9dafcfde7c26..4b38f6b8806e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -386,14 +386,14 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { unsigned long kernel_start = __pa(_stext); - unsigned long kernel_end = __pa(_etext); + unsigned long kernel_end = __pa(__init_begin); /* * Take care not to create a writable alias for the * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel text */ + /* No overlap with the kernel text/rodata */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -402,7 +402,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel text mapping. + * This block overlaps the kernel text/rodata mapping. * Map the portion(s) which don't overlap. 
*/ if (start < kernel_start) @@ -417,7 +417,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end early_pgtable_alloc); /* - * Map the linear alias of the [_stext, _etext) interval as + * Map the linear alias of the [_stext, __init_begin) interval as * read-only/non-executable. This makes the contents of the * region accessible to subsystems such as hibernate, but * protects it from inadvertent modification or execution. @@ -447,12 +447,18 @@ static void __init map_mem(pgd_t *pgd) void mark_rodata_ro(void) { - if (!IS_ENABLED(CONFIG_DEBUG_RODATA)) - return; + unsigned long section_size; + section_size = (unsigned long)_etext - (unsigned long)_stext; create_mapping_late(__pa(_stext), (unsigned long)_stext, - (unsigned long)__init_begin - (unsigned long)_stext, - PAGE_KERNEL_ROX); + section_size, PAGE_KERNEL_ROX); + /* + * mark .rodata as read only. Use __init_begin rather than __end_rodata + * to cover NOTES and EXCEPTION_TABLE. + */ + section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; + create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata, + section_size, PAGE_KERNEL_RO); } void fixup_init(void) @@ -491,9 +497,10 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, */ static void __init map_kernel(pgd_t *pgd) { - static struct vm_struct vmlinux_text, vmlinux_init, vmlinux_data; + static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_chunk(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata); map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, &vmlinux_init); map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); From 50bad62b51e802b5bd5e731915234905fbe4d4f6 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Wed, 21 Sep 2016 15:25:04 -0700 Subject: [PATCH 0503/3220] BACKPORT: arm64: Correctly bounds check virt_addr_valid virt_addr_valid is supposed to return true if and only if virt_to_page returns a valid page structure. The current macro does math on whatever address is given and passes that to pfn_valid to verify. vmalloc and module addresses can happen to generate a pfn that 'happens' to be valid. Fix this by only performing the pfn_valid check on addresses that have the potential to be valid. 
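A minimal sketch of the failure mode being fixed (illustrative; check_virt_addr_valid() is hypothetical):

static void __init check_virt_addr_valid(void)
{
	void *vm = vmalloc(PAGE_SIZE);

	if (!vm)
		return;
	/*
	 * __pa(vm) applies linear-map arithmetic to a vmalloc address,
	 * so the old macro could land on a pfn that happens to be
	 * valid. The added range check now rejects vm before
	 * pfn_valid() is ever consulted.
	 */
	WARN_ON(virt_addr_valid(vm));
	vfree(vm);
}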
Acked-by: Mark Rutland Signed-off-by: Laura Abbott Signed-off-by: Will Deacon Bug: 31374226 Change-Id: I75cbeb3edb059f19af992b7f5d0baa283f95991b (cherry picked from commit ca219452c6b8a6cd1369b6a78b1cf069d0386865) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/memory.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 12f8a00fb3f1..ba1b3409d7ed 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -193,7 +193,11 @@ static inline void *phys_to_virt(phys_addr_t x) #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define _virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) + +#define _virt_addr_is_linear(kaddr) (((u64)(kaddr)) >= PAGE_OFFSET) +#define virt_addr_valid(kaddr) (_virt_addr_is_linear(kaddr) && \ + _virt_addr_valid(kaddr)) #endif From 762a327de1b8dc48f813c7d397f7b627cff215c5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 8 Sep 2016 09:57:07 +0200 Subject: [PATCH 0504/3220] UPSTREAM: fs/proc/kcore.c: Make bounce buffer global for read Next patch adds bounce buffer for ktext area, so it's convenient to have single bounce buffer for both vmalloc/module and ktext cases. Suggested-by: Linus Torvalds Signed-off-by: Jiri Olsa Acked-by: Kees Cook Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: I8f517354e6d12aed75ed4ae6c0a6adef0a1e61da (cherry picked from commit f5beeb1851ea6f8cfcf2657f26cb24c0582b4945) Signed-off-by: Sami Tolvanen --- fs/proc/kcore.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6e37..2dc84040b0cb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { + char *buf = file->private_data; ssize_t acc = 0; size_t size, tsz; size_t elf_buflen; @@ -500,18 +501,10 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (clear_user(buffer, tsz)) return -EFAULT; } else if (is_vmalloc_or_module_addr((void *)start)) { - char * elf_buf; - - elf_buf = kzalloc(tsz, GFP_KERNEL); - if (!elf_buf) - return -ENOMEM; - vread(elf_buf, (char *)start, tsz); + vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, elf_buf, tsz)) { - kfree(elf_buf); + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; - } - kfree(elf_buf); } else { if (kern_addr_valid(start)) { unsigned long n; @@ -549,6 +542,11 @@ static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; + + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -559,10 +557,16 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .release = release_kcore, .llseek = default_llseek, }; From c7642969a507168ab9d4a5d4715030af5d2a11a7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa 
Date: Thu, 8 Sep 2016 09:57:08 +0200 Subject: [PATCH 0505/3220] UPSTREAM: fs/proc/kcore.c: Add bounce buffer for ktext data We hit the hardened usercopy feature check for kernel text access when reading the kcore file: usercopy: kernel memory exposure attempt detected from ffffffff8179a01f (<kernel text>) (4065 bytes) kernel BUG at mm/usercopy.c:75! Bypass this check for kcore by adding a bounce buffer for ktext data. Reported-by: Steve Best Fixes: f5509cc18daa ("mm: Hardened usercopy") Suggested-by: Kees Cook Signed-off-by: Jiri Olsa Acked-by: Kees Cook Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: Ic93e6041b67d804a994518bf4950811f828b406e (cherry picked from commit df04abfd181acc276ba6762c8206891ae10ae00d) Signed-off-by: Sami Tolvanen --- fs/proc/kcore.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 2dc84040b0cb..21f198aa0961 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -509,7 +509,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (kern_addr_valid(start)) { unsigned long n; - n = copy_to_user(buffer, (char *)start, tsz); + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + memcpy(buf, (char *) start, tsz); + n = copy_to_user(buffer, buf, tsz); /* * We cannot distinguish between fault on source * and fault on destination. When this happens From d02eecfab2e3b76185f396a1e3dad004b6962696 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 5 Oct 2016 09:52:07 -0700 Subject: [PATCH 0506/3220] ANDROID: android-base: CONFIG_HARDENED_USERCOPY=y Bug: 31374226 Change-Id: I977e76395017d8d718ea634421b3635023934ef9 Signed-off-by: Sami Tolvanen --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 1ebef5c69fb6..8531a7a79e33 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -24,6 +24,7 @@ CONFIG_DM_VERITY=y CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y +CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y From 620ed669ae0bca79885abb5a3c242ac5bf000f3b Mon Sep 17 00:00:00 2001 From: EunTaik Lee Date: Wed, 24 Feb 2016 04:38:06 +0000 Subject: [PATCH 0507/3220] UPSTREAM: staging/android/ion : fix a race condition in the ion driver There is a use-after-free problem in the ion driver, caused by a race condition in the ion_ioctl() function. A handle has a ref count of 1 and two tasks on different cpus call ION_IOC_FREE simultaneously:

cpu 0                                   cpu 1
-------------------------------------------------------
ion_handle_get_by_id()
(ref == 2)
                                        ion_handle_get_by_id()
                                        (ref == 3)
ion_free()
(ref == 2)
                                        ion_handle_put()
                                        (ref == 1)
                                        ion_free()
                                        (ref == 0 so ion_handle_destroy()
                                        is called and the handle is freed.)
ion_handle_put() is called and it
decreases the slub's next free pointer

The problem is detected as an unaligned access in the spinlock functions, since they use load-exclusive instructions. In some cases it corrupts the slub's free pointer, which causes a misaligned access to the next free pointer (kmalloc returns a pointer like ffffc0745b4580aa), and lots of other hard-to-debug problems. This symptom occurs because the first member of the ion_handle structure is the reference count, and the ion driver decrements that count after the handle has been freed. To fix this problem, the client->lock mutex is extended to protect all the code paths that use the handle.
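Condensed from the ion_ioctl() hunk below, the fixed ION_IOC_FREE path keeps the lookup, the free and the final put inside one critical section:

/* Sketch of the fixed ION_IOC_FREE path; see the full diff below. */
mutex_lock(&client->lock);
handle = ion_handle_get_by_id_nolock(client, data.handle.handle);
if (IS_ERR(handle)) {
	mutex_unlock(&client->lock);
	return PTR_ERR(handle);
}
ion_free_nolock(client, handle);
ion_handle_put_nolock(handle);
mutex_unlock(&client->lock);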
Signed-off-by: Eun Taik Lee Reviewed-by: Laura Abbott Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 9590232bb4f4cc824f3425a6e1349afbe6d6d2b7) bug: 31568617 Change-Id: I4ea2be0cad3305c4e196126a02e2ab7108ef0976 --- drivers/staging/android/ion/ion.c | 59 +++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index e237e9f3312d..1958d58fdca3 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -385,13 +385,22 @@ static void ion_handle_get(struct ion_handle *handle) kref_get(&handle->ref); } -static int ion_handle_put(struct ion_handle *handle) +static int ion_handle_put_nolock(struct ion_handle *handle) +{ + int ret; + + ret = kref_put(&handle->ref, ion_handle_destroy); + + return ret; +} + +int ion_handle_put(struct ion_handle *handle) { struct ion_client *client = handle->client; int ret; mutex_lock(&client->lock); - ret = kref_put(&handle->ref, ion_handle_destroy); + ret = ion_handle_put_nolock(handle); mutex_unlock(&client->lock); return ret; @@ -415,18 +424,28 @@ static struct ion_handle *ion_handle_lookup(struct ion_client *client, return ERR_PTR(-EINVAL); } -static struct ion_handle *ion_handle_get_by_id(struct ion_client *client, +static struct ion_handle *ion_handle_get_by_id_nolock(struct ion_client *client, + int id) +{ + struct ion_handle *handle; + + handle = idr_find(&client->idr, id); + if (handle) + ion_handle_get(handle); + + return handle ? handle : ERR_PTR(-EINVAL); +} + +struct ion_handle *ion_handle_get_by_id(struct ion_client *client, int id) { struct ion_handle *handle; mutex_lock(&client->lock); - handle = idr_find(&client->idr, id); - if (handle) - ion_handle_get(handle); + handle = ion_handle_get_by_id_nolock(client, id); mutex_unlock(&client->lock); - return handle ? 
handle : ERR_PTR(-EINVAL); + return handle; } static bool ion_handle_validate(struct ion_client *client, @@ -530,22 +549,28 @@ struct ion_handle *ion_alloc(struct ion_client *client, size_t len, } EXPORT_SYMBOL(ion_alloc); -void ion_free(struct ion_client *client, struct ion_handle *handle) +static void ion_free_nolock(struct ion_client *client, struct ion_handle *handle) { bool valid_handle; BUG_ON(client != handle->client); - mutex_lock(&client->lock); valid_handle = ion_handle_validate(client, handle); if (!valid_handle) { WARN(1, "%s: invalid handle passed to free.\n", __func__); - mutex_unlock(&client->lock); return; } + ion_handle_put_nolock(handle); +} + +void ion_free(struct ion_client *client, struct ion_handle *handle) +{ + BUG_ON(client != handle->client); + + mutex_lock(&client->lock); + ion_free_nolock(client, handle); mutex_unlock(&client->lock); - ion_handle_put(handle); } EXPORT_SYMBOL(ion_free); @@ -1281,11 +1306,15 @@ static long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct ion_handle *handle; - handle = ion_handle_get_by_id(client, data.handle.handle); - if (IS_ERR(handle)) + mutex_lock(&client->lock); + handle = ion_handle_get_by_id_nolock(client, data.handle.handle); + if (IS_ERR(handle)) { + mutex_unlock(&client->lock); return PTR_ERR(handle); - ion_free(client, handle); - ion_handle_put(handle); + } + ion_free_nolock(client, handle); + ion_handle_put_nolock(handle); + mutex_unlock(&client->lock); break; } case ION_IOC_SHARE: From b81f4c5f3167142b5b0b91b474658a850e8c5450 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Tue, 2 Aug 2016 15:40:39 -0700 Subject: [PATCH 0508/3220] ANDROID: binder: Add strong ref checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevent using a binder_ref with only weak references where a strong reference is required. 
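Condensed from the diff below, call sites now state whether they need a strong reference; for example, a transaction target must be strong:

ref = binder_get_ref(proc, tr->target.handle, true /* strong required */);
if (ref == NULL) {
	binder_user_error("%d:%d got transaction to invalid handle\n",
			  proc->pid, thread->pid);
	/* the transaction fails: weak-only descriptors no longer pass */
}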
BUG: 30445380 Change-Id: I66c15b066808f28bd27bfe50fd0e03ff45a09fca Signed-off-by: Arve Hjønnevåg --- drivers/android/binder.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f0ce9959d14d..ce84ff14ce9b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1003,7 +1003,7 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) static struct binder_ref *binder_get_ref(struct binder_proc *proc, - uint32_t desc) + uint32_t desc, bool need_strong_ref) { struct rb_node *n = proc->refs_by_desc.rb_node; struct binder_ref *ref; @@ -1011,12 +1011,16 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, while (n) { ref = rb_entry(n, struct binder_ref, rb_node_desc); - if (desc < ref->desc) + if (desc < ref->desc) { n = n->rb_left; - else if (desc > ref->desc) + } else if (desc > ref->desc) { n = n->rb_right; - else + } else if (need_strong_ref && !ref->strong) { + binder_user_error("tried to use weak ref as strong ref\n"); + return NULL; + } else { return ref; + } } return NULL; } @@ -1286,7 +1290,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle); + struct binder_ref *ref = binder_get_ref(proc, fp->handle, + fp->type == BINDER_TYPE_HANDLE); if (ref == NULL) { pr_err("transaction release %d bad handle %d\n", @@ -1381,7 +1386,7 @@ static void binder_transaction(struct binder_proc *proc, if (tr->target.handle) { struct binder_ref *ref; - ref = binder_get_ref(proc, tr->target.handle); + ref = binder_get_ref(proc, tr->target.handle, true); if (ref == NULL) { binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); @@ -1590,7 +1595,8 @@ static void binder_transaction(struct binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle); + struct binder_ref *ref = binder_get_ref(proc, fp->handle, + fp->type == BINDER_TYPE_HANDLE); if (ref == NULL) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", @@ -1801,7 +1807,9 @@ static int binder_thread_write(struct binder_proc *proc, ref->desc); } } else - ref = binder_get_ref(proc, target); + ref = binder_get_ref(proc, target, + cmd == BC_ACQUIRE || + cmd == BC_RELEASE); if (ref == NULL) { binder_user_error("%d:%d refcount change on invalid ref %d\n", proc->pid, thread->pid, target); @@ -1997,7 +2005,7 @@ static int binder_thread_write(struct binder_proc *proc, if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); - ref = binder_get_ref(proc, target); + ref = binder_get_ref(proc, target, false); if (ref == NULL) { binder_user_error("%d:%d %s invalid ref %d\n", proc->pid, thread->pid, From 78c26bebd182874a7383a5d07446673b61d1ea0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= Date: Fri, 12 Aug 2016 16:04:28 -0700 Subject: [PATCH 0509/3220] ANDROID: binder: Clear binder and cookie when setting handle in flat binder struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents leaking pointers between processes BUG: 30768347 Change-Id: Id898076926f658a1b8b27a3ccb848756b36de4ca Signed-off-by: Arve Hjønnevåg --- drivers/android/binder.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/android/binder.c 
b/drivers/android/binder.c index ce84ff14ce9b..b8ba5b7516b9 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1583,7 +1583,9 @@ static void binder_transaction(struct binder_proc *proc, fp->type = BINDER_TYPE_HANDLE; else fp->type = BINDER_TYPE_WEAK_HANDLE; + fp->binder = 0; fp->handle = ref->desc; + fp->cookie = 0; binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, &thread->todo); @@ -1631,7 +1633,9 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } + fp->binder = 0; fp->handle = new_ref->desc; + fp->cookie = 0; binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); trace_binder_transaction_ref_to_ref(t, ref, new_ref); @@ -1685,6 +1689,7 @@ static void binder_transaction(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n", fp->handle, target_fd); /* TODO: fput? */ + fp->binder = 0; fp->handle = target_fd; } break; From 077d045666866063087a1e71e5e82ede3e139d96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 May 2016 11:48:25 -0400 Subject: [PATCH 0510/3220] UPSTREAM: percpu: fix synchronization between chunk->map_extend_work and chunk destruction (cherry picked from commit 4f996e234dad488e5d9ba0858bc1bae12eff82c3) Atomic allocations can trigger async map extensions which is serviced by chunk->map_extend_work. pcpu_balance_work which is responsible for destroying idle chunks wasn't synchronizing properly against chunk->map_extend_work and may end up freeing the chunk while the work item is still in flight. This patch fixes the bug by rolling async map extension operations into pcpu_balance_work. Signed-off-by: Tejun Heo Reported-and-tested-by: Alexei Starovoitov Reported-by: Vlastimil Babka Reported-by: Sasha Levin Cc: stable@vger.kernel.org # v3.18+ Fixes: 9c824b6a172c ("percpu: make sure chunk->map array has available space") Change-Id: I8f4aaf7fe0bc0e9f353d41e0a7840c40d6a32117 Bug: 31596597 --- mm/percpu.c | 57 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 8a943b97a053..58b014900f0f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -110,7 +110,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ - struct work_struct map_extend_work;/* async ->map[] extension */ + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -164,6 +164,9 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* chunks which need their map areas extended, protected by pcpu_lock */ +static LIST_HEAD(pcpu_map_extend_chunks); + /* * The number of empty populated pages, protected by pcpu_lock. The * reserved chunk doesn't contribute to the count. 
@@ -397,13 +400,19 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { int margin, new_alloc; + lockdep_assert_held(&pcpu_lock); + if (is_atomic) { margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && - pcpu_async_enabled) - schedule_work(&chunk->map_extend_work); + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { + if (list_empty(&chunk->map_extend_list)) { + list_add_tail(&chunk->map_extend_list, + &pcpu_map_extend_chunks); + pcpu_schedule_balance_work(); + } + } } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; } @@ -469,20 +478,6 @@ out_unlock: return 0; } -static void pcpu_map_extend_workfn(struct work_struct *work) -{ - struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, - map_extend_work); - int new_alloc; - - spin_lock_irq(&pcpu_lock); - new_alloc = pcpu_need_to_extend(chunk, false); - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); -} - /** * pcpu_fit_in_area - try to fit the requested allocation in a candidate area * @chunk: chunk the candidate area belongs to @@ -742,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); - INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&chunk->map_extend_list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -1131,6 +1126,7 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; + list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1148,6 +1144,25 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* service chunks which requested async area map extension */ + do { + int new_alloc = 0; + + spin_lock_irq(&pcpu_lock); + + chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, + struct pcpu_chunk, map_extend_list); + if (chunk) { + list_del_init(&chunk->map_extend_list); + new_alloc = pcpu_need_to_extend(chunk, false); + } + + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); + } while (chunk); + /* * Ensure there are certain number of free populated pages for * atomic allocs. 
Fill up from the most packed so that atomic @@ -1646,7 +1661,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); - INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&schunk->map_extend_list); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); @@ -1675,7 +1690,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); - INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&dchunk->map_extend_list); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); From 3b41e21ffaa38ff8ed8fa8b71f4410f45f42011f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 May 2016 11:48:25 -0400 Subject: [PATCH 0511/3220] UPSTREAM: percpu: fix synchronization between synchronous map extension and chunk destruction (cherry picked from commit 6710e594f71ccaad8101bc64321152af7cd9ea28) For non-atomic allocations, pcpu_alloc() can try to extend the area map synchronously after dropping pcpu_lock; however, the extension wasn't synchronized against chunk destruction and the chunk might get freed while extension is in progress. This patch fixes the bug by putting most of non-atomic allocations under pcpu_alloc_mutex to synchronize against pcpu_balance_work which is responsible for async chunk management including destruction. Signed-off-by: Tejun Heo Reported-and-tested-by: Alexei Starovoitov Reported-by: Vlastimil Babka Reported-by: Sasha Levin Cc: stable@vger.kernel.org # v3.18+ Fixes: 1a4d76076cda ("percpu: implement asynchronous chunk population") Change-Id: I8800962e658e78eac866fff4a4e00294c58a3dec Bug: 31596597 --- mm/percpu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 58b014900f0f..1f376bce413c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -160,7 +160,7 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ @@ -446,6 +446,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; + lockdep_assert_held(&pcpu_alloc_mutex); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -892,6 +894,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } + if (!is_atomic) + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -964,12 +969,9 @@ restart: if (is_atomic) goto fail; - mutex_lock(&pcpu_alloc_mutex); - if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); if (!chunk) { - mutex_unlock(&pcpu_alloc_mutex); err = "failed to allocate new chunk"; goto fail; } @@ -980,7 +982,6 @@ restart: spin_lock_irqsave(&pcpu_lock, flags); } - mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: @@ -990,8 +991,6 @@ area_found: if (!is_atomic) { int page_start, page_end, rs, re; - mutex_lock(&pcpu_alloc_mutex); - page_start 
= PFN_DOWN(off); page_end = PFN_UP(off + size); @@ -1002,7 +1001,6 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - mutex_unlock(&pcpu_alloc_mutex); pcpu_free_area(chunk, off, &occ_pages); err = "failed to populate"; goto fail_unlock; @@ -1042,6 +1040,8 @@ fail: /* see the flag handling in pcpu_blance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); + } else { + mutex_unlock(&pcpu_alloc_mutex); } return NULL; } From d9e3c1d5f7bd8df275f8bce5ed78393c7a5fb22a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 8 Mar 2016 21:09:29 +0700 Subject: [PATCH 0512/3220] UPSTREAM: arm64: account for sparsemem section alignment when choosing vmemmap offset Commit dfd55ad85e4a ("arm64: vmemmap: use virtual projection of linear region") fixed an issue where the struct page array would overflow into the adjacent virtual memory region if system RAM was placed so high up in physical memory that its addresses were not representable in the build time configured virtual address size. However, the fix failed to take into account that the vmemmap region needs to be relatively aligned with respect to the sparsemem section size, so that a sequence of page structs corresponding with a sparsemem section in the linear region appears naturally aligned in the vmemmap region. So round up vmemmap to sparsemem section size. Since this essentially moves the projection of the linear region up in memory, also revert the reduction of the size of the vmemmap region. Cc: Fixes: dfd55ad85e4a ("arm64: vmemmap: use virtual projection of linear region") Tested-by: Mark Langsdorf Tested-by: David Daney Tested-by: Robert Richter Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 36e5cd6b897e17d03008f81e075625d8e43e52d0) Signed-off-by: Jeff Vander Stoep Change-Id: I77bad8c6a7c1a7c3dda92a37ceef5ddfb196ec70 --- arch/arm64/include/asm/pgtable.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0c82a1d9f31e..40017aa2fcbd 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -40,13 +40,14 @@ * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) #define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define VMEMMAP_START (VMALLOC_END + SZ_64K) -#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) +#define vmemmap ((struct page *)VMEMMAP_START - \ + SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL From 663cc539523fbf35732ba0af6a15bcb6d6370c91 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 18 Apr 2016 17:09:47 +0200 Subject: [PATCH 0513/3220] UPSTREAM: arm64: relocatable: deal with physically misaligned kernel images When booting a relocatable kernel image, there is no practical reason to refuse an image whose load address is not exactly TEXT_OFFSET bytes above a 2 MB aligned base address, as long as the physical and virtual misalignment with respect to the swapper block size are equal, and are both aligned to THREAD_SIZE. 
Since the virtual misalignment is under our control when we first enter the kernel proper, we can simply choose its value to be equal to the physical misalignment. So treat the misalignment of the physical load address as the initial KASLR offset, and fix up the remaining code to deal with that. Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 Bug: 32122850 (cherry picked from commit 08cdac619c81b3fa8cd73aeed2330ffe0a0b73ca) Signed-off-by: Jeff Vander Stoep Change-Id: I658cb3467ba9a4f5b1f5a1cbb972fdc5a3562bf0 --- arch/arm64/kernel/head.S | 9 ++++++--- arch/arm64/kernel/kaslr.c | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 9bfa58fea8ce..461d6cc258dd 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -210,8 +211,8 @@ section_table: ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode - mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET + and x23, x24, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 /* @@ -488,11 +489,13 @@ __mmap_switched: bl kasan_early_init #endif #ifdef CONFIG_RANDOMIZE_BASE - cbnz x23, 0f // already running randomized? + tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized? + b.ne 0f mov x0, x21 // pass FDT address in x0 + mov x1, x23 // pass modulo offset in x1 bl kaslr_early_init // parse FDT for KASLR options cbz x0, 0f // KASLR disabled? just proceed - mov x23, x0 // record KASLR offset + orr x23, x23, x0 // record KASLR offset ret x28 // we must enable KASLR, return // to __enable_mmu() 0: diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 582983920054..b05469173ba5 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -74,7 +74,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, * containing function pointers) to be reinitialized, and zero-initialized * .bss variables will be reset to 0. */ -u64 __init kaslr_early_init(u64 dt_phys) +u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset) { void *fdt; u64 seed, offset, mask, module_range; @@ -132,8 +132,8 @@ u64 __init kaslr_early_init(u64 dt_phys) * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this * happens, increase the KASLR offset by the size of the kernel image. */ - if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != - (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) offset = (offset + (u64)(_end - _text)) & mask; if (IS_ENABLED(CONFIG_KASAN)) From e78f134a78a0ae95b83ac0cac47ab0bb584ebaa7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 6 Oct 2016 15:53:38 -0700 Subject: [PATCH 0514/3220] CHROMIUM: remove Android's cgroup generic permissions checks The implementation is utterly broken, resulting in all processes being allowed to move tasks between sets (as long as they have access to the "tasks" attribute), and upstream is heading towards checking only capability anyway, so let's get rid of this code.
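For reference, the default check that remains once the hook is gone is equivalent to the following sketch (simplified from cgroup_procs_write_permission() in the diff below; the helper name is ours):

#include <errno.h>

/*
 * The writer must be root, or share its euid with the target task's
 * uid or suid; there is no longer a per-subsystem escape hatch.
 */
static int may_move_task(unsigned int writer_euid,
			 unsigned int task_uid, unsigned int task_suid)
{
	if (writer_euid == 0)	/* GLOBAL_ROOT_UID */
		return 0;
	if (writer_euid == task_uid || writer_euid == task_suid)
		return 0;
	return -EACCES;
}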
BUG=b:31790445,chromium:647994 TEST=Boot android container, examine logcat Change-Id: I2f780a5992c34e52a8f2d0b3557fc9d490da2779 Signed-off-by: Dmitry Torokhov Reviewed-on: https://chromium-review.googlesource.com/394967 Reviewed-by: Ricky Zhou Reviewed-by: John Stultz --- Documentation/cgroups/cgroups.txt | 9 ----- include/linux/cgroup-defs.h | 1 - include/linux/cgroup.h | 14 -------- kernel/cgroup.c | 59 ++----------------------------- kernel/sched/core.c | 1 - mm/memcontrol.c | 10 ------ 6 files changed, 2 insertions(+), 92 deletions(-) diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 2d984e20783b..c6256ae9885b 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -578,15 +578,6 @@ is completely unused; @cgrp->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -int allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called prior to moving a task into a cgroup; if the subsystem -returns an error, this will abort the attach operation. Used -to extend the permission checks - if all subsystems in a cgroup -return 0, the attach will be allowed to proceed, even if the -default permission check (root or same user) fails. - int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4a4eea01956c..06b77f9dd3f2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -422,7 +422,6 @@ struct cgroup_subsys { void (*css_reset)(struct cgroup_subsys_state *css); void (*css_e_css_changed)(struct cgroup_subsys_state *css); - int (*allow_attach)(struct cgroup_taskset *tset); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 70358b9f5a7a..cb91b44f5f78 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -528,16 +528,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -/* - * Default Android check for whether the current process is allowed to move a - * task across cgroups, either because CAP_SYS_NICE is set or because the uid - * of the calling process is the same as the moved task or because we are - * running as root. - * Returns 0 if this is allowed, or -EACCES otherwise. 
- */ -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset); - - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -562,10 +552,6 @@ static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } -static inline int subsys_cgroup_allow_attach(void *tset) -{ - return -EINVAL; -} #endif /* !CONFIG_CGROUPS */ #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bd541053efc2..371ee5a827e0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2663,45 +2663,6 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } -int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - if (capable(CAP_SYS_NICE)) - return 0; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if (current != task && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - return -EACCES; - } - - return 0; -} - -static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - int i; - int ret; - - for_each_css(css, i, cgrp) { - if (css->ss->allow_attach) { - ret = css->ss->allow_attach(tset); - if (ret) - return ret; - } else { - return -EACCES; - } - } - - return 0; -} - static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) @@ -2716,24 +2677,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - /* - * if the default permission check fails, give each - * cgroup a chance to extend the permission check - */ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct css_set *cset; - cset = task_css_set(task); - list_add(&cset->mg_node, &tset.src_csets); - ret = cgroup_allow_attach(dst_cgrp, &tset); - list_del(&tset.src_csets); - if (ret) - ret = -EACCES; - } + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c90bfb620981..471094bfbcc6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8935,7 +8935,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c714f33f494..ee6acd279953 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4879,11 +4879,6 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) return ret; } -static int mem_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - return subsys_cgroup_allow_attach(tset); -} - static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) @@ -5045,10 +5040,6 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static int mem_cgroup_allow_attach(struct cgroup_taskset *tset) -{ - return 0; -} static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } @@ -5232,7 +5223,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .can_attach = 
mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, - .allow_attach = mem_cgroup_allow_attach, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, From 140cab831036741cac88255f282cd68b72140356 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 6 Oct 2016 16:14:16 -0700 Subject: [PATCH 0515/3220] CHROMIUM: cgroups: relax permissions on moving tasks between cgroups Android expects system_server to be able to move tasks between different cgroups/cpusets, but does not want to be running as root. Let's relax permission check so that processes can move other tasks if they have CAP_SYS_NICE in the affected task's user namespace. BUG=b:31790445,chromium:647994 TEST=Boot android container, examine logcat Change-Id: Ia919c66ab6ed6a6daf7c4cf67feb38b13b1ad09b Signed-off-by: Dmitry Torokhov Reviewed-on: https://chromium-review.googlesource.com/394927 Reviewed-by: Ricky Zhou --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 371ee5a827e0..45c5e134d05b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2677,7 +2677,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { From 7af6188c36382c1a11551cf8bea62bcb3dcaa585 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 11 Oct 2016 13:51:27 -0700 Subject: [PATCH 0516/3220] BACKPORT: lib: harden strncpy_from_user The strncpy_from_user() accessor is effectively a copy_from_user() specialised to copy strings, terminating early at a NUL byte if possible. In other respects it is identical, and can be used to copy an arbitrarily large buffer from userspace into the kernel. Conceptually, it exposes a similar attack surface. As with copy_from_user(), we check the destination range when the kernel is built with KASAN, but unlike copy_from_user() we do not check the destination buffer when using HARDENED_USERCOPY. As strncpy_from_user() calls get_user() in a loop, we must call check_object_size() explicitly. This patch adds this instrumentation to strncpy_from_user(), per the same rationale as with the regular copy_from_user(). In the absence of hardened usercopy this will have no impact as the instrumentation expands to an empty static inline function. 
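For context, a typical call site looks like the fragment below (inside some ioctl or syscall handler; 'name' and 'user_ptr' are illustrative). With HARDENED_USERCOPY, the added check_object_size() validates the destination against its containing slab object before the get_user() loop runs:

	char name[64];
	long len;

	len = strncpy_from_user(name, user_ptr, sizeof(name));
	if (len < 0)
		return len;		/* -EFAULT: faulted on the user buffer */
	if (len == sizeof(name))
		return -EINVAL;		/* source string did not fit */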
Link: http://lkml.kernel.org/r/1472221903-31181-1-git-send-email-mark.rutland@arm.com Signed-off-by: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 31374226 Change-Id: I898e4e9f19307e37a9be497cb1a0d7f1e3911661 (cherry picked from commit bf90e56e467ed5766722972d483e6711889ed1b0) Signed-off-by: Sami Tolvanen --- lib/strncpy_from_user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 5a003a2ebd96..05efc1fa97f0 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max = max_addr - src_addr; long retval; + check_object_size(dst, count, false); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); From 7bb5218b77d0aef456634625fff7909f88b4f705 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 18 Oct 2016 12:35:03 -0700 Subject: [PATCH 0517/3220] cgroup: Remove leftover instances of allow_attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: kernel/sched/tune.c:718:2: error: unknown field ‘allow_attach’ specified in initializer kernel/cpuset.c:2087:2: error: unknown field 'allow_attach' specified in initializer Change-Id: Ie524350ffc6158f3182d90095cca502e58b6f197 Fixes: e78f134a78a0 ("CHROMIUM: remove Android's cgroup generic permissions checks") Signed-off-by: Guenter Roeck --- kernel/cpuset.c | 18 ------------------ kernel/sched/tune.c | 7 ------- 2 files changed, 25 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3b4f27981778..3d0f77112eb3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2061,30 +2061,12 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } -static int cpuset_allow_attach(struct cgroup_taskset *tset) -{ - const struct cred *cred = current_cred(), *tcred; - struct task_struct *task; - struct cgroup_subsys_state *css; - - cgroup_taskset_for_each(task, css, tset) { - tcred = __task_cred(task); - - if ((current != task) && !capable(CAP_SYS_ADMIN) && - cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val) - return -EACCES; - } - - return 0; -} - struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, - .allow_attach = cpuset_allow_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .bind = cpuset_bind, diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 505d7b35b0e1..68a24a044b0a 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -368,12 +368,6 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu) raw_spin_unlock_irqrestore(&bg->lock, irq_flags); } -int schedtune_allow_attach(struct cgroup_taskset *tset) -{ - /* We always allows tasks to be moved between existing CGroups */ - return 0; -} - int schedtune_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; @@ -715,7 +709,6 @@ schedtune_css_free(struct cgroup_subsys_state *css) struct cgroup_subsys schedtune_cgrp_subsys = { .css_alloc = schedtune_css_alloc, .css_free = schedtune_css_free, - .allow_attach = schedtune_allow_attach, .can_attach = schedtune_can_attach, .cancel_attach = schedtune_cancel_attach, .legacy_cftypes = files, From 
ffa907b7664062250d813d97fb9a9bdb5511897a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 17 Oct 2016 16:18:39 +0100 Subject: [PATCH 0518/3220] UPSTREAM: arm64: kaslr: keep modules close to the kernel when DYNAMIC_FTRACE=y The RANDOMIZE_MODULE_REGION_FULL Kconfig option allows KASLR to be configured in such a way that kernel modules and the core kernel are allocated completely independently, which implies that modules are likely to require branches via PLT entries to reach the core kernel. The dynamic ftrace code does not expect that, and assumes that it can patch module code to perform a relative branch to anywhere in the core kernel. This may result in errors such as branch_imm_common: offset out of range ------------[ cut here ]------------ WARNING: CPU: 3 PID: 196 at kernel/trace/ftrace.c:1995 ftrace_bug+0x220/0x2e8 Modules linked in: CPU: 3 PID: 196 Comm: systemd-udevd Not tainted 4.8.0-22-generic #24 Hardware name: AMD Seattle/Seattle, BIOS 10:34:40 Oct 6 2016 task: ffff8d1bef7dde80 task.stack: ffff8d1bef6b0000 PC is at ftrace_bug+0x220/0x2e8 LR is at ftrace_process_locs+0x330/0x430 So make RANDOMIZE_MODULE_REGION_FULL mutually exclusive with DYNAMIC_FTRACE at the Kconfig level. Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 8fe88a4145cdeee486af60e61f5d5a14f804fa45) Signed-off-by: Jeff Vander Stoep Change-Id: Ifb2474dcbb7a3066fe5724ee53a2048d61e80ccc --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 488fed2d5e5e..a4d3c3cd7b23 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -822,7 +822,7 @@ config RANDOMIZE_BASE config RANDOMIZE_MODULE_REGION_FULL bool "Randomize the module region independently from the core kernel" - depends on RANDOMIZE_BASE + depends on RANDOMIZE_BASE && !DYNAMIC_FTRACE default y help Randomizes the location of the module region without considering the From 4a8b645cef3caa87b85269078d9e1f02115107b6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Oct 2016 17:42:09 +0100 Subject: [PATCH 0519/3220] UPSTREAM: arm64: kaslr: fix breakage with CONFIG_MODVERSIONS=y As it turns out, the KASLR code breaks CONFIG_MODVERSIONS, since the kcrctab has an absolute address field that is relocated at runtime when the kernel offset is randomized. This has been fixed already for PowerPC in the past, so simply wire up the existing code dealing with this issue. 
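The "existing code" in question is the hook kernel/module.c has carried for relocatable PowerPC kernels; in the v4.4-era source it looks roughly like this:

static unsigned long maybe_relocated(unsigned long crc,
				     const struct module *crc_owner)
{
#ifdef ARCH_RELOCATES_KCRCTAB
	if (crc_owner == NULL)	/* CRC lives in the (relocated) core kernel */
		return crc - (unsigned long)reloc_start;
#endif
	return crc;
}

All the architecture has to provide is ARCH_RELOCATES_KCRCTAB and a reloc_start equal to the runtime displacement, which the hunk below supplies as kimage_vaddr - KIMAGE_VADDR.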
Cc: Fixes: f80fb3a3d508 ("arm64: add support for kernel ASLR") Tested-by: Timur Tabi Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 30369029 (cherry picked from commit 8fe88a4145cdeee486af60e61f5d5a14f804fa45) Signed-off-by: Jeff Vander Stoep Change-Id: Ia40bb68eb5ba7df14214243657948d469f1d5717 --- arch/arm64/include/asm/module.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index e12af6754634..06ff7fd9e81f 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -17,6 +17,7 @@ #define __ASM_MODULE_H #include +#include #define MODULE_ARCH_VERMAGIC "aarch64" @@ -32,6 +33,10 @@ u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, Elf64_Sym *sym); #ifdef CONFIG_RANDOMIZE_BASE +#ifdef CONFIG_MODVERSIONS +#define ARCH_RELOCATES_KCRCTAB +#define reloc_start (kimage_vaddr - KIMAGE_VADDR) +#endif extern u64 module_alloc_base; #else #define module_alloc_base ((u64)_etext - MODULES_VSIZE) From a3c8dc25c446485dcbf8a978f755eead9eac3671 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Thu, 9 Jun 2016 23:43:28 -0700 Subject: [PATCH 0520/3220] UPSTREAM: cpu/hotplug: Handle unbalanced hotplug enable/disable (cherry picked from commit 01b41159066531cc8d664362ff0cd89dd137bbfa) When cpu_hotplug_enable() is called unbalanced w/o a preceding cpu_hotplug_disable() the code emits a warning, but happily decrements the disabled counter. This causes the next operations to malfunction. Prevent the decrement and just emit a warning. Signed-off-by: Lianwei Wang Cc: peterz@infradead.org Cc: linux-pm@vger.kernel.org Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/1465541008-12476-1-git-send-email-lianwei.wang@gmail.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ced7c751648..c8a1751be224 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -185,10 +185,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -631,7 +638,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; From 64f0245f312a849acc83b7c6cdcc8f7c1a14cca3 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 18 Oct 2016 16:20:23 -0700 Subject: [PATCH 0521/3220] [RFC]cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions Try to better match what we're pushing upstream, use CAP_SYS_RESOURCE instead of CAP_SYS_NICE, which shouldn't affect Android as Zygote and system_server already use CAP_SYS_RESOURCE.
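From userspace, the operation being gated is simply a write of a tid into the destination group's tasks file; an illustrative helper (paths and names are examples, not Android code):

#include <stdio.h>

static int move_task(const char *cgroup_dir, int tid)
{
	char path[256];
	FILE *f;
	int ret;

	snprintf(path, sizeof(path), "%s/tasks", cgroup_dir);
	f = fopen(path, "w");
	if (f == NULL)
		return -1;
	ret = (fprintf(f, "%d\n", tid) < 0) ? -1 : 0;
	fclose(f);
	return ret;
}

With this patch the write succeeds for a non-root writer with a different uid as long as it holds CAP_SYS_RESOURCE in the target task's user namespace.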
Signed-off-by: John Stultz --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 45c5e134d05b..00af24ac0167 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2678,7 +2678,7 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid) && - !ns_capable(tcred->user_ns, CAP_SYS_NICE)) + !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) ret = -EACCES; if (!ret && cgroup_on_dfl(dst_cgrp)) { From b6be9b7b199b05a13d72dc562110eebc725ed0ec Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 20 Oct 2016 15:45:01 -0400 Subject: [PATCH 0522/3220] disable aio support in recommended configuration The aio interface adds substantial attack surface for a feature that's not being exposed by Android at all. It's unlikely that anyone is using the kernel feature directly either. This feature is rarely used even on servers. The glibc POSIX aio calls really use thread pools. The lack of widespread usage also means this is relatively poorly audited/tested. The kernel's aio rarely provides performance benefits over using a thread pool and is quite incomplete in terms of system call coverage along with having edge cases where blocking can occur. Part of the performance issue is the fact that it only supports direct io, not buffered io. The existing API is considered fundamentally flawed and it's unlikely it will be expanded, but rather replaced: https://marc.info/?l=linux-aio&m=145255815216051&w=2 Since ext4 encryption means no direct io support, kernel aio isn't even going to work properly on Android devices using file-based encryption. Change-Id: Iccc7cab4437791240817e6275a23e1d3f4a47f2d Signed-off-by: Daniel Micay --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 3465a848d74d..3fd0b13488a1 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -1,4 +1,5 @@ # KEEP ALPHABETICALLY SORTED +# CONFIG_AIO is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set From ce0c65986ceeaa05249c990e1744d4caeb734e60 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 13 Jul 2016 12:06:49 +0200 Subject: [PATCH 0523/3220] android: binder: split flat_binder_object. flat_binder_object is used for both handling binder objects and file descriptors, even though the two are mostly independent. Since we'll have more fixup objects in binder in the future, instead of extending flat_binder_object again, split out file descriptors to their own object while retaining backwards compatibility to existing user-space clients. All binder objects just share a header. 
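The shared header is consumed with the usual container_of() downcast, via the to_flat_binder_object()/to_binder_fd_object() helpers the patch introduces. A sketch of the dispatch pattern (fragment; buffer and offset come from the transaction being parsed):

	struct binder_object_header *hdr;

	hdr = (struct binder_object_header *)(buffer->data + offset);
	switch (hdr->type) {
	case BINDER_TYPE_BINDER:
	case BINDER_TYPE_WEAK_BINDER:
	case BINDER_TYPE_HANDLE:
	case BINDER_TYPE_WEAK_HANDLE: {
		struct flat_binder_object *fp = to_flat_binder_object(hdr);
		/* translate fp->binder / fp->handle for the target */
		break;
	}
	case BINDER_TYPE_FD: {
		struct binder_fd_object *fp = to_binder_fd_object(hdr);
		/* install a duplicate of fp->fd in the target process */
		break;
	}
	default:
		/* unknown object type: fail the transaction */
		break;
	}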
Change-Id: If3c55f27a2aa8f21815383e0e807be47895e4786 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 162 +++++++++++++++++++--------- include/uapi/linux/android/binder.h | 31 +++++- 2 files changed, 141 insertions(+), 52 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b8ba5b7516b9..50cc56f3d0cc 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -146,6 +146,11 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, binder_stop_on_user_error = 2; \ } while (0) +#define to_flat_binder_object(hdr) \ + container_of(hdr, struct flat_binder_object, hdr) + +#define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -1241,6 +1246,47 @@ static void binder_send_failed_reply(struct binder_transaction *t, } } +/** + * binder_validate_object() - checks for a valid metadata object in a buffer. + * @buffer: binder_buffer that we're parsing. + * @offset: offset in the buffer at which to validate an object. + * + * Return: If there's a valid metadata object at @offset in @buffer, the + * size of that object. Otherwise, it returns zero. + */ +static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) +{ + /* Check if we can read a header first */ + struct binder_object_header *hdr; + size_t object_size = 0; + + if (offset > buffer->data_size - sizeof(*hdr) || + buffer->data_size < sizeof(*hdr) || + !IS_ALIGNED(offset, sizeof(u32))) + return 0; + + /* Ok, now see if we can read a complete object. */ + hdr = (struct binder_object_header *)(buffer->data + offset); + switch (hdr->type) { + case BINDER_TYPE_BINDER: + case BINDER_TYPE_WEAK_BINDER: + case BINDER_TYPE_HANDLE: + case BINDER_TYPE_WEAK_HANDLE: + object_size = sizeof(struct flat_binder_object); + break; + case BINDER_TYPE_FD: + object_size = sizeof(struct binder_fd_object); + break; + default: + return 0; + } + if (offset <= buffer->data_size - object_size && + buffer->data_size >= object_size) + return object_size; + else + return 0; +} + static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, binder_size_t *failed_at) @@ -1263,21 +1309,23 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, else off_end = (void *)offp + buffer->offsets_size; for (; offp < off_end; offp++) { - struct flat_binder_object *fp; + struct binder_object_header *hdr; + size_t object_size = binder_validate_object(buffer, *offp); - if (*offp > buffer->data_size - sizeof(*fp) || - buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - pr_err("transaction release %d bad offset %lld, size %zd\n", + if (object_size == 0) { + pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)*offp, buffer->data_size); continue; } - fp = (struct flat_binder_object *)(buffer->data + *offp); - switch (fp->type) { + hdr = (struct binder_object_header *)(buffer->data + *offp); + switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { - struct binder_node *node = binder_get_node(proc, fp->binder); + struct flat_binder_object *fp; + struct binder_node *node; + fp = to_flat_binder_object(hdr); + node = binder_get_node(proc, fp->binder); if (node == NULL) { pr_err("transaction release %d bad node %016llx\n", debug_id, (u64)fp->binder); @@ -1286,13 +1334,17 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " node 
%d u%016llx\n", node->debug_id, (u64)node->ptr); - binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0); + binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, + 0); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle, - fp->type == BINDER_TYPE_HANDLE); + struct flat_binder_object *fp; + struct binder_ref *ref; + fp = to_flat_binder_object(hdr); + ref = binder_get_ref(proc, fp->handle, + hdr->type == BINDER_TYPE_HANDLE); if (ref == NULL) { pr_err("transaction release %d bad handle %d\n", debug_id, fp->handle); @@ -1301,19 +1353,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d (node %d)\n", ref->debug_id, ref->desc, ref->node->debug_id); - binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE); + binder_dec_ref(ref, hdr->type == BINDER_TYPE_HANDLE); } break; - case BINDER_TYPE_FD: + case BINDER_TYPE_FD: { + struct binder_fd_object *fp = to_binder_fd_object(hdr); + binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d\n", fp->handle); + " fd %d\n", fp->fd); if (failed_at) - task_close_fd(proc, fp->handle); - break; + task_close_fd(proc, fp->fd); + } break; default: pr_err("transaction release %d bad object type %x\n", - debug_id, fp->type); + debug_id, hdr->type); break; } } @@ -1530,28 +1584,29 @@ static void binder_transaction(struct binder_proc *proc, off_end = (void *)offp + tr->offsets_size; off_min = 0; for (; offp < off_end; offp++) { - struct flat_binder_object *fp; + struct binder_object_header *hdr; + size_t object_size = binder_validate_object(t->buffer, *offp); - if (*offp > t->buffer->data_size - sizeof(*fp) || - *offp < off_min || - t->buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - binder_user_error("%d:%d got transaction with invalid offset, %lld (min %lld, max %lld)\n", + if (object_size == 0 || *offp < off_min) { + binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, (u64)*offp, (u64)off_min, - (u64)(t->buffer->data_size - - sizeof(*fp))); + (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; goto err_bad_offset; } - fp = (struct flat_binder_object *)(t->buffer->data + *offp); - off_min = *offp + sizeof(struct flat_binder_object); - switch (fp->type) { + + hdr = (struct binder_object_header *)(t->buffer->data + *offp); + off_min = *offp + object_size; + switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { + struct flat_binder_object *fp; + struct binder_node *node; struct binder_ref *ref; - struct binder_node *node = binder_get_node(proc, fp->binder); + fp = to_flat_binder_object(hdr); + node = binder_get_node(proc, fp->binder); if (node == NULL) { node = binder_new_node(proc, fp->binder, fp->cookie); if (node == NULL) { @@ -1579,14 +1634,14 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_binder_get_ref_for_node_failed; } - if (fp->type == BINDER_TYPE_BINDER) - fp->type = BINDER_TYPE_HANDLE; + if (hdr->type == BINDER_TYPE_BINDER) + hdr->type = BINDER_TYPE_HANDLE; else - fp->type = BINDER_TYPE_WEAK_HANDLE; + hdr->type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; fp->handle = ref->desc; fp->cookie = 0; - binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, + binder_inc_ref(ref, hdr->type == BINDER_TYPE_HANDLE, &thread->todo); trace_binder_transaction_node_to_ref(t, node, ref); @@ -1597,9 +1652,12 @@ static void binder_transaction(struct 
binder_proc *proc, } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle, - fp->type == BINDER_TYPE_HANDLE); + struct flat_binder_object *fp; + struct binder_ref *ref; + fp = to_flat_binder_object(hdr); + ref = binder_get_ref(proc, fp->handle, + hdr->type == BINDER_TYPE_HANDLE); if (ref == NULL) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, @@ -1613,13 +1671,15 @@ static void binder_transaction(struct binder_proc *proc, goto err_binder_get_ref_failed; } if (ref->node->proc == target_proc) { - if (fp->type == BINDER_TYPE_HANDLE) - fp->type = BINDER_TYPE_BINDER; + if (hdr->type == BINDER_TYPE_HANDLE) + hdr->type = BINDER_TYPE_BINDER; else - fp->type = BINDER_TYPE_WEAK_BINDER; + hdr->type = BINDER_TYPE_WEAK_BINDER; fp->binder = ref->node->ptr; fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL); + binder_inc_node(ref->node, + hdr->type == BINDER_TYPE_BINDER, + 0, NULL); trace_binder_transaction_ref_to_node(t, ref); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", @@ -1636,7 +1696,9 @@ static void binder_transaction(struct binder_proc *proc, fp->binder = 0; fp->handle = new_ref->desc; fp->cookie = 0; - binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); + binder_inc_ref(new_ref, + hdr->type == BINDER_TYPE_HANDLE, + NULL); trace_binder_transaction_ref_to_ref(t, ref, new_ref); binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1649,25 +1711,26 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_FD: { int target_fd; struct file *file; + struct binder_fd_object *fp = to_binder_fd_object(hdr); if (reply) { if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } } else if (!target_node->accept_fds) { binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fd_not_allowed; } - file = fget(fp->handle); + file = fget(fp->fd); if (file == NULL) { binder_user_error("%d:%d got transaction with invalid fd, %d\n", - proc->pid, thread->pid, fp->handle); + proc->pid, thread->pid, fp->fd); return_error = BR_FAILED_REPLY; goto err_fget_failed; } @@ -1685,17 +1748,18 @@ static void binder_transaction(struct binder_proc *proc, goto err_get_unused_fd_failed; } task_fd_install(target_proc, target_fd, file); - trace_binder_transaction_fd(t, fp->handle, target_fd); + trace_binder_transaction_fd(t, fp->fd, target_fd); binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d -> %d\n", fp->handle, target_fd); + " fd %d -> %d\n", fp->fd, + target_fd); /* TODO: fput? 
*/ - fp->binder = 0; - fp->handle = target_fd; + fp->pad_binder = 0; + fp->fd = target_fd; } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", - proc->pid, thread->pid, fp->type); + proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; goto err_bad_object_type; } diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 41420e341e75..f67c2b1c0713 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -48,6 +48,14 @@ typedef __u64 binder_size_t; typedef __u64 binder_uintptr_t; #endif +/** + * struct binder_object_header - header shared by all binder metadata objects. + * @type: type of the object + */ +struct binder_object_header { + __u32 type; +}; + /* * This is the flattened representation of a Binder object for transfer * between processes. The 'offsets' supplied as part of a binder transaction @@ -56,9 +64,8 @@ typedef __u64 binder_uintptr_t; * between processes. */ struct flat_binder_object { - /* 8 bytes for large_flat_header. */ - __u32 type; - __u32 flags; + struct binder_object_header hdr; + __u32 flags; /* 8 bytes of data. */ union { @@ -70,6 +77,24 @@ struct flat_binder_object { binder_uintptr_t cookie; }; +/** + * struct binder_fd_object - describes a filedescriptor to be fixed up. + * @hdr: common header structure + * @pad_flags: padding to remain compatible with old userspace code + * @pad_binder: padding to remain compatible with old userspace code + * @fd: file descriptor + * @cookie: opaque data, used by user-space + */ +struct binder_fd_object { + struct binder_object_header hdr; + __u32 pad_flags; + union { + binder_uintptr_t pad_binder; + __u32 fd; + }; + + binder_uintptr_t cookie; +}; /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. From 803df5635e4ac377586e3664cf798c3152670bc0 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 15:51:48 +0200 Subject: [PATCH 0524/3220] android: binder: support multiple context managers. Move the context manager state into a separate struct context, and allow for each process to have its own context associated with it. 
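Note that with this patch alone every open still gets the shared global_context; the refactoring is a prerequisite for the multi-device patch later in this series. The intended end state, sketched from userspace (the second device name is hypothetical at this point in the series):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/android/binder.h>

	int fd_a = open("/dev/binder", O_RDWR | O_CLOEXEC);	/* context A */
	int fd_b = open("/dev/hwbinder", O_RDWR | O_CLOEXEC);	/* context B, hypothetical */

	/* Registers a manager only in fd_a's context; fd_b's context
	 * still has binder_context_mgr_node == NULL. */
	ioctl(fd_a, BINDER_SET_CONTEXT_MGR, 0);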
Change-Id: Ifa934370241a2d447dd519eac3fd0682c6d00ab4 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 60 ++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 50cc56f3d0cc..93780d1cc199 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -56,8 +56,6 @@ static HLIST_HEAD(binder_dead_nodes); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -static struct binder_node *binder_context_mgr_node; -static kuid_t binder_context_mgr_uid = INVALID_UID; static int binder_last_id; static struct workqueue_struct *binder_deferred_workqueue; @@ -216,6 +214,15 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( return e; } +struct binder_context { + struct binder_node *binder_context_mgr_node; + kuid_t binder_context_mgr_uid; +}; + +static struct binder_context global_context = { + .binder_context_mgr_uid = INVALID_UID, +}; + struct binder_work { struct list_head entry; enum { @@ -331,6 +338,7 @@ struct binder_proc { int ready_threads; long default_priority; struct dentry *debugfs_entry; + struct binder_context *context; }; enum { @@ -935,8 +943,10 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, if (internal) { if (target_list == NULL && node->internal_strong_refs == 0 && - !(node == binder_context_mgr_node && - node->has_strong_ref)) { + !(node->proc && + node == node->proc->context-> + binder_context_mgr_node && + node->has_strong_ref)) { pr_err("invalid inc strong node for %d\n", node->debug_id); return -EINVAL; @@ -1037,6 +1047,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, struct rb_node **p = &proc->refs_by_node.rb_node; struct rb_node *parent = NULL; struct binder_ref *ref, *new_ref; + struct binder_context *context = proc->context; while (*p) { parent = *p; @@ -1059,7 +1070,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, rb_link_node(&new_ref->rb_node_node, parent, p); rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); - new_ref->desc = (node == binder_context_mgr_node) ? 0 : 1; + new_ref->desc = (node == context->binder_context_mgr_node) ? 0 : 1; for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (ref->desc > new_ref->desc) @@ -1389,6 +1400,7 @@ static void binder_transaction(struct binder_proc *proc, struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error; + struct binder_context *context = proc->context; e = binder_transaction_log_add(&binder_transaction_log); e->call_type = reply ? 
2 : !!(tr->flags & TF_ONE_WAY); @@ -1449,7 +1461,7 @@ static void binder_transaction(struct binder_proc *proc, } target_node = ref->node; } else { - target_node = binder_context_mgr_node; + target_node = context->binder_context_mgr_node; if (target_node == NULL) { return_error = BR_DEAD_REPLY; goto err_no_context_mgr_node; @@ -1840,6 +1852,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_size_t *consumed) { uint32_t cmd; + struct binder_context *context = proc->context; void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; @@ -1866,10 +1879,10 @@ static int binder_thread_write(struct binder_proc *proc, if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); - if (target == 0 && binder_context_mgr_node && + if (target == 0 && context->binder_context_mgr_node && (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { ref = binder_get_ref_for_node(proc, - binder_context_mgr_node); + context->binder_context_mgr_node); if (ref->desc != target) { binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", proc->pid, thread->pid, @@ -2775,9 +2788,11 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) { int ret = 0; struct binder_proc *proc = filp->private_data; + struct binder_context *context = proc->context; + kuid_t curr_euid = current_euid(); - if (binder_context_mgr_node != NULL) { + if (context->binder_context_mgr_node) { pr_err("BINDER_SET_CONTEXT_MGR already set\n"); ret = -EBUSY; goto out; @@ -2785,27 +2800,27 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) ret = security_binder_set_context_mgr(proc->tsk); if (ret < 0) goto out; - if (uid_valid(binder_context_mgr_uid)) { - if (!uid_eq(binder_context_mgr_uid, curr_euid)) { + if (uid_valid(context->binder_context_mgr_uid)) { + if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) { pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", from_kuid(&init_user_ns, curr_euid), from_kuid(&init_user_ns, - binder_context_mgr_uid)); + context->binder_context_mgr_uid)); ret = -EPERM; goto out; } } else { - binder_context_mgr_uid = curr_euid; + context->binder_context_mgr_uid = curr_euid; } - binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (binder_context_mgr_node == NULL) { + context->binder_context_mgr_node = binder_new_node(proc, 0, 0); + if (!context->binder_context_mgr_node) { ret = -ENOMEM; goto out; } - binder_context_mgr_node->local_weak_refs++; - binder_context_mgr_node->local_strong_refs++; - binder_context_mgr_node->has_strong_ref = 1; - binder_context_mgr_node->has_weak_ref = 1; + context->binder_context_mgr_node->local_weak_refs++; + context->binder_context_mgr_node->local_strong_refs++; + context->binder_context_mgr_node->has_strong_ref = 1; + context->binder_context_mgr_node->has_weak_ref = 1; out: return ret; } @@ -3035,6 +3050,7 @@ static int binder_open(struct inode *nodp, struct file *filp) return -ENOMEM; get_task_struct(current); proc->tsk = current; + proc->context = &global_context; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); @@ -3147,6 +3163,7 @@ static int binder_node_release(struct binder_node *node, int refs) static void binder_deferred_release(struct binder_proc *proc) { struct binder_transaction *t; + struct binder_context *context = proc->context; struct rb_node *n; int threads, nodes, incoming_refs, outgoing_refs, buffers, active_transactions, page_count; @@ -3156,11 +3173,12 @@ static 
void binder_deferred_release(struct binder_proc *proc) hlist_del(&proc->proc_node); - if (binder_context_mgr_node && binder_context_mgr_node->proc == proc) { + if (context->binder_context_mgr_node && + context->binder_context_mgr_node->proc == proc) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%s: %d context_mgr_node gone\n", __func__, proc->pid); - binder_context_mgr_node = NULL; + context->binder_context_mgr_node = NULL; } threads = 0; From 8b980bee79719478fddc7ff595386b0c32661a33 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 17 Oct 2016 15:17:31 +0200 Subject: [PATCH 0525/3220] android: binder: deal with contexts in debugfs. Properly print the context in debugfs entries. Change-Id: If10c2129536d9f39bae542afd7318ca79af60e3a Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 93780d1cc199..635be4d9b8f3 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -190,6 +190,7 @@ struct binder_transaction_log_entry { int to_node; int data_size; int offsets_size; + const char *context_name; }; struct binder_transaction_log { int next; @@ -217,10 +218,12 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_context { struct binder_node *binder_context_mgr_node; kuid_t binder_context_mgr_uid; + const char *name; }; static struct binder_context global_context = { .binder_context_mgr_uid = INVALID_UID, + .name = "binder", }; struct binder_work { @@ -1409,6 +1412,7 @@ static void binder_transaction(struct binder_proc *proc, e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; + e->context_name = proc->context->name; if (reply) { in_reply_to = thread->transaction_stack; @@ -3069,8 +3073,17 @@ static int binder_open(struct inode *nodp, struct file *filp) char strbuf[11]; snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); + /* + * proc debug entries are shared between contexts, so + * this will fail if the process tries to open the driver + * again with a different context. The printing code will + * anyway print all contexts that a given PID has, so this + * is not a problem.
+ */ proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO, - binder_debugfs_dir_entry_proc, proc, &binder_proc_fops); + binder_debugfs_dir_entry_proc, + (void *)(unsigned long)proc->pid, + &binder_proc_fops); } return 0; @@ -3465,6 +3478,7 @@ static void print_binder_proc(struct seq_file *m, size_t header_pos; seq_printf(m, "proc %d\n", proc->pid); + seq_printf(m, "context %s\n", proc->context->name); header_pos = m->count; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) @@ -3589,6 +3603,7 @@ static void print_binder_proc_stats(struct seq_file *m, int count, strong, weak; seq_printf(m, "proc %d\n", proc->pid); + seq_printf(m, "context %s\n", proc->context->name); count = 0; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; @@ -3696,23 +3711,18 @@ static int binder_transactions_show(struct seq_file *m, void *unused) static int binder_proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; - struct binder_proc *proc = m->private; + int pid = (unsigned long)m->private; int do_lock = !binder_debug_no_lock; - bool valid_proc = false; if (do_lock) binder_lock(__func__); hlist_for_each_entry(itr, &binder_procs, proc_node) { - if (itr == proc) { - valid_proc = true; - break; + if (itr->pid == pid) { + seq_puts(m, "binder proc state:\n"); + print_binder_proc(m, itr, 1); } } - if (valid_proc) { - seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, proc, 1); - } if (do_lock) binder_unlock(__func__); return 0; @@ -3722,11 +3732,11 @@ static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { seq_printf(m, - "%d: %s from %d:%d to %d:%d node %d handle %d size %d:%d\n", + "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d\n", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, - e->from_thread, e->to_proc, e->to_thread, e->to_node, - e->target_handle, e->data_size, e->offsets_size); + e->from_thread, e->to_proc, e->to_thread, e->context_name, + e->to_node, e->target_handle, e->data_size, e->offsets_size); } static int binder_transaction_log_show(struct seq_file *m, void *unused) From 04e3812e712fce296da074fbfba6519a77678dfb Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 16:08:09 +0200 Subject: [PATCH 0526/3220] android: binder: support multiple /dev instances. Add a new module parameter 'devices', that can be used to specify the names of the binder device nodes we want to populate in /dev. Each device node has its own context manager, and is therefore logically separated from all the other device nodes. The config option CONFIG_ANDROID_BINDER_DEVICES can be used to set the default value of the parameter. This approach was favored over using IPC namespaces, mostly because we require a single process to be a part of multiple binder contexts, which seemed harder to achieve with namespaces. Change-Id: I3df72b2a19b5ad5a0360e6322482db7b00a12b24 Signed-off-by: Martijn Coenen --- drivers/android/Kconfig | 12 ++++++ drivers/android/binder.c | 85 ++++++++++++++++++++++++++++++++++------ 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index bdfc6c6f4f5a..a82fc022d34b 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -19,6 +19,18 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. 
+config ANDROID_BINDER_DEVICES + string "Android Binder devices" + depends on ANDROID_BINDER_IPC + default "binder" + ---help--- + Default value for the binder.devices parameter. + + The binder.devices parameter is a comma-separated list of strings + that specifies the names of the binder device nodes that will be + created. Each binder device has its own context manager, and is + therefore logically separated from the other devices. + config ANDROID_BINDER_IPC_32BIT bool depends on !64BIT && ANDROID_BINDER_IPC diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 635be4d9b8f3..29700728bbd1 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -50,6 +50,7 @@ static DEFINE_MUTEX(binder_main_lock); static DEFINE_MUTEX(binder_deferred_lock); static DEFINE_MUTEX(binder_mmap_lock); +static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); static HLIST_HEAD(binder_deferred_list); static HLIST_HEAD(binder_dead_nodes); @@ -114,6 +115,9 @@ module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); static bool binder_debug_no_lock; module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO); +static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; +module_param_named(devices, binder_devices_param, charp, S_IRUGO); + static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; @@ -221,9 +225,10 @@ struct binder_context { const char *name; }; -static struct binder_context global_context = { - .binder_context_mgr_uid = INVALID_UID, - .name = "binder", +struct binder_device { + struct hlist_node hlist; + struct miscdevice miscdev; + struct binder_context context; }; struct binder_work { @@ -3045,6 +3050,7 @@ err_bad_arg: static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc; + struct binder_device *binder_dev; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n", current->group_leader->pid, current->pid); @@ -3054,10 +3060,12 @@ static int binder_open(struct inode *nodp, struct file *filp) return -ENOMEM; get_task_struct(current); proc->tsk = current; - proc->context = &global_context; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); + binder_dev = container_of(filp->private_data, struct binder_device, + miscdev); + proc->context = &binder_dev->context; binder_lock(__func__); @@ -3764,20 +3772,44 @@ static const struct file_operations binder_fops = { .release = binder_release, }; -static struct miscdevice binder_miscdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "binder", - .fops = &binder_fops -}; - BINDER_DEBUG_ENTRY(state); BINDER_DEBUG_ENTRY(stats); BINDER_DEBUG_ENTRY(transactions); BINDER_DEBUG_ENTRY(transaction_log); +static int __init init_binder_device(const char *name) +{ + int ret; + struct binder_device *binder_device; + + binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); + if (!binder_device) + return -ENOMEM; + + binder_device->miscdev.fops = &binder_fops; + binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; + binder_device->miscdev.name = name; + + binder_device->context.binder_context_mgr_uid = INVALID_UID; + binder_device->context.name = name; + + ret = misc_register(&binder_device->miscdev); + if (ret < 0) { + kfree(binder_device); + return ret; + } + + hlist_add_head(&binder_device->hlist, &binder_devices); + + return ret; +} + static int __init binder_init(void) { int ret; + char *device_name, *device_names; + struct binder_device *device; + 
struct hlist_node *tmp; binder_deferred_workqueue = create_singlethread_workqueue("binder"); if (!binder_deferred_workqueue) return -ENOMEM; @@ -3787,7 +3819,7 @@ static int __init binder_init(void) if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", binder_debugfs_dir_entry_root); - ret = misc_register(&binder_miscdev); + if (binder_debugfs_dir_entry_root) { debugfs_create_file("state", S_IRUGO, @@ -3815,6 +3847,37 @@ static int __init binder_init(void) &binder_transaction_log_failed, &binder_transaction_log_fops); } + + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. + */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) { + ret = -ENOMEM; + goto err_alloc_device_names_failed; + } + strcpy(device_names, binder_devices_param); + + while ((device_name = strsep(&device_names, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; + } + + return ret; + +err_init_binder_device_failed: + hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { + misc_deregister(&device->miscdev); + hlist_del(&device->hlist); + kfree(device); + } +err_alloc_device_names_failed: + debugfs_remove_recursive(binder_debugfs_dir_entry_root); + + destroy_workqueue(binder_deferred_workqueue); + return ret; } From bfd49fea44974857219e7387a11534c12d2440fc Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 29 Sep 2016 15:38:14 +0200 Subject: [PATCH 0527/3220] android: binder: refactor binder_transaction() Move handling of fixups for binder objects, handles and file descriptors into separate functions. Change-Id: If6849f1caee3834aa87d0ab08950bb1e21ec6e38 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 309 ++++++++++++++++++++++----------------- 1 file changed, 172 insertions(+), 137 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 29700728bbd1..ef42b10c41ad 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1392,10 +1392,172 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } } +static int binder_translate_binder(struct flat_binder_object *fp, + struct binder_transaction *t, + struct binder_thread *thread) +{ + struct binder_node *node; + struct binder_ref *ref; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + node = binder_get_node(proc, fp->binder); + if (!node) { + node = binder_new_node(proc, fp->binder, fp->cookie); + if (!node) + return -ENOMEM; + + node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + } + if (fp->cookie != node->cookie) { + binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", + proc->pid, thread->pid, (u64)fp->binder, + node->debug_id, (u64)fp->cookie, + (u64)node->cookie); + return -EINVAL; + } + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) + return -EPERM; + + ref = binder_get_ref_for_node(target_proc, node); + if (!ref) + return -EINVAL; + + if (fp->hdr.type == BINDER_TYPE_BINDER) + fp->hdr.type = BINDER_TYPE_HANDLE; + else + fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; + fp->binder = 0; + fp->handle = ref->desc; + fp->cookie = 0; + binder_inc_ref(ref, fp->hdr.type == BINDER_TYPE_HANDLE, &thread->todo); + + trace_binder_transaction_node_to_ref(t, node, ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " node %d u%016llx -> ref %d desc %d\n", + node->debug_id,
(u64)node->ptr, + ref->debug_id, ref->desc); + + return 0; +} + +static int binder_translate_handle(struct flat_binder_object *fp, + struct binder_transaction *t, + struct binder_thread *thread) +{ + struct binder_ref *ref; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + ref = binder_get_ref(proc, fp->handle, + fp->hdr.type == BINDER_TYPE_HANDLE); + if (!ref) { + binder_user_error("%d:%d got transaction with invalid handle, %d\n", + proc->pid, thread->pid, fp->handle); + return -EINVAL; + } + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) + return -EPERM; + + if (ref->node->proc == target_proc) { + if (fp->hdr.type == BINDER_TYPE_HANDLE) + fp->hdr.type = BINDER_TYPE_BINDER; + else + fp->hdr.type = BINDER_TYPE_WEAK_BINDER; + fp->binder = ref->node->ptr; + fp->cookie = ref->node->cookie; + binder_inc_node(ref->node, fp->hdr.type == BINDER_TYPE_BINDER, + 0, NULL); + trace_binder_transaction_ref_to_node(t, ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d -> node %d u%016llx\n", + ref->debug_id, ref->desc, ref->node->debug_id, + (u64)ref->node->ptr); + } else { + struct binder_ref *new_ref; + + new_ref = binder_get_ref_for_node(target_proc, ref->node); + if (!new_ref) + return -EINVAL; + + fp->binder = 0; + fp->handle = new_ref->desc; + fp->cookie = 0; + binder_inc_ref(new_ref, fp->hdr.type == BINDER_TYPE_HANDLE, + NULL); + trace_binder_transaction_ref_to_ref(t, ref, new_ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d -> ref %d desc %d (node %d)\n", + ref->debug_id, ref->desc, new_ref->debug_id, + new_ref->desc, ref->node->debug_id); + } + return 0; +} + +static int binder_translate_fd(int fd, + struct binder_transaction *t, + struct binder_thread *thread, + struct binder_transaction *in_reply_to) +{ + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + int target_fd; + struct file *file; + int ret; + bool target_allows_fd; + + if (in_reply_to) + target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS); + else + target_allows_fd = t->buffer->target_node->accept_fds; + if (!target_allows_fd) { + binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n", + proc->pid, thread->pid, + in_reply_to ? 
"reply" : "transaction", + fd); + ret = -EPERM; + goto err_fd_not_accepted; + } + + file = fget(fd); + if (!file) { + binder_user_error("%d:%d got transaction with invalid fd, %d\n", + proc->pid, thread->pid, fd); + ret = -EBADF; + goto err_fget; + } + ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file); + if (ret < 0) { + ret = -EPERM; + goto err_security; + } + + target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); + if (target_fd < 0) { + ret = -ENOMEM; + goto err_get_unused_fd; + } + task_fd_install(target_proc, target_fd, file); + trace_binder_transaction_fd(t, fd, target_fd); + binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n", + fd, target_fd); + + return target_fd; + +err_get_unused_fd: +err_security: + fput(file); +err_fget: +err_fd_not_accepted: + return ret; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply) { + int ret; struct binder_transaction *t; struct binder_work *tcomplete; binder_size_t *offp, *off_end; @@ -1623,157 +1785,35 @@ static void binder_transaction(struct binder_proc *proc, case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; - struct binder_node *node; - struct binder_ref *ref; fp = to_flat_binder_object(hdr); - node = binder_get_node(proc, fp->binder); - if (node == NULL) { - node = binder_new_node(proc, fp->binder, fp->cookie); - if (node == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_new_node_failed; - } - node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; - node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); - } - if (fp->cookie != node->cookie) { - binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", - proc->pid, thread->pid, - (u64)fp->binder, node->debug_id, - (u64)fp->cookie, (u64)node->cookie); + ret = binder_translate_binder(fp, t, thread); + if (ret < 0) { return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; + goto err_translate_failed; } - if (security_binder_transfer_binder(proc->tsk, - target_proc->tsk)) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - ref = binder_get_ref_for_node(target_proc, node); - if (ref == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - if (hdr->type == BINDER_TYPE_BINDER) - hdr->type = BINDER_TYPE_HANDLE; - else - hdr->type = BINDER_TYPE_WEAK_HANDLE; - fp->binder = 0; - fp->handle = ref->desc; - fp->cookie = 0; - binder_inc_ref(ref, hdr->type == BINDER_TYPE_HANDLE, - &thread->todo); - - trace_binder_transaction_node_to_ref(t, node, ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " node %d u%016llx -> ref %d desc %d\n", - node->debug_id, (u64)node->ptr, - ref->debug_id, ref->desc); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; - struct binder_ref *ref; fp = to_flat_binder_object(hdr); - ref = binder_get_ref(proc, fp->handle, - hdr->type == BINDER_TYPE_HANDLE); - if (ref == NULL) { - binder_user_error("%d:%d got transaction with invalid handle, %d\n", - proc->pid, - thread->pid, fp->handle); + ret = binder_translate_handle(fp, t, thread); + if (ret < 0) { return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_failed; - } - if (security_binder_transfer_binder(proc->tsk, - target_proc->tsk)) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_failed; - } - if (ref->node->proc == target_proc) { - if (hdr->type == 
BINDER_TYPE_HANDLE) - hdr->type = BINDER_TYPE_BINDER; - else - hdr->type = BINDER_TYPE_WEAK_BINDER; - fp->binder = ref->node->ptr; - fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, - hdr->type == BINDER_TYPE_BINDER, - 0, NULL); - trace_binder_transaction_ref_to_node(t, ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> node %d u%016llx\n", - ref->debug_id, ref->desc, ref->node->debug_id, - (u64)ref->node->ptr); - } else { - struct binder_ref *new_ref; - - new_ref = binder_get_ref_for_node(target_proc, ref->node); - if (new_ref == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - fp->binder = 0; - fp->handle = new_ref->desc; - fp->cookie = 0; - binder_inc_ref(new_ref, - hdr->type == BINDER_TYPE_HANDLE, - NULL); - trace_binder_transaction_ref_to_ref(t, ref, - new_ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, new_ref->debug_id, - new_ref->desc, ref->node->debug_id); + goto err_translate_failed; } } break; case BINDER_TYPE_FD: { - int target_fd; - struct file *file; struct binder_fd_object *fp = to_binder_fd_object(hdr); + int target_fd = binder_translate_fd(fp->fd, t, thread, + in_reply_to); - if (reply) { - if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { - binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fd_not_allowed; - } - } else if (!target_node->accept_fds) { - binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fd_not_allowed; - } - - file = fget(fp->fd); - if (file == NULL) { - binder_user_error("%d:%d got transaction with invalid fd, %d\n", - proc->pid, thread->pid, fp->fd); - return_error = BR_FAILED_REPLY; - goto err_fget_failed; - } - if (security_binder_transfer_file(proc->tsk, - target_proc->tsk, - file) < 0) { - fput(file); - return_error = BR_FAILED_REPLY; - goto err_get_unused_fd_failed; - } - target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); if (target_fd < 0) { - fput(file); return_error = BR_FAILED_REPLY; - goto err_get_unused_fd_failed; + goto err_translate_failed; } - task_fd_install(target_proc, target_fd, file); - trace_binder_transaction_fd(t, fp->fd, target_fd); - binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d -> %d\n", fp->fd, - target_fd); - /* TODO: fput? */ fp->pad_binder = 0; fp->fd = target_fd; } break; @@ -1810,12 +1850,7 @@ static void binder_transaction(struct binder_proc *proc, wake_up_interruptible(target_wait); return; -err_get_unused_fd_failed: -err_fget_failed: -err_fd_not_allowed: -err_binder_get_ref_for_node_failed: -err_binder_get_ref_failed: -err_binder_new_node_failed: +err_translate_failed: err_bad_object_type: err_bad_offset: err_copy_data_failed: From 843a25788dba57df35820bcc91796fb30b56335c Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 14:05:40 +0200 Subject: [PATCH 0528/3220] android: binder: add extra size to allocator. The binder_buffer allocator currently only allocates space for the data and offsets buffers of a Parcel. This change allows for requesting an additional chunk of data in the buffer, which can for example be used to hold additional meta-data about the transaction (eg a security context). 
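For illustration, the resulting layout of a binder_buffer's data area can be pictured as follows (a sketch derived from the sizing code in this patch; the region labels are descriptive, not kernel identifiers):

	| data (data_size) | offsets (offsets_size) | extra (extra_buffers_size) |

Each region is rounded up to sizeof(void *) alignment, and each partial sum is checked for wrap-around before the allocation is attempted.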
Change-Id: I58ab9c383a2e1a3057aae6adaa596ce867f1b157 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 41 +++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ef42b10c41ad..83a6c1bbbad5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -303,6 +303,7 @@ struct binder_buffer { struct binder_node *target_node; size_t data_size; size_t offsets_size; + size_t extra_buffers_size; uint8_t data[0]; }; @@ -670,7 +671,9 @@ err_no_vma: static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, size_t data_size, - size_t offsets_size, int is_async) + size_t offsets_size, + size_t extra_buffers_size, + int is_async) { struct rb_node *n = proc->free_buffers.rb_node; struct binder_buffer *buffer; @@ -678,7 +681,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, struct rb_node *best_fit = NULL; void *has_page_addr; void *end_page_addr; - size_t size; + size_t size, data_offsets_size; if (proc->vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", @@ -686,15 +689,20 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, return NULL; } - size = ALIGN(data_size, sizeof(void *)) + + data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); - if (size < data_size || size < offsets_size) { + if (data_offsets_size < data_size || data_offsets_size < offsets_size) { binder_user_error("%d: got transaction with invalid size %zd-%zd\n", proc->pid, data_size, offsets_size); return NULL; } - + size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); + if (size < data_offsets_size || size < extra_buffers_size) { + binder_user_error("%d: got transaction with invalid extra_buffers_size %zd\n", + proc->pid, extra_buffers_size); + return NULL; + } if (is_async && proc->free_async_space < size + sizeof(struct binder_buffer)) { binder_debug(BINDER_DEBUG_BUFFER_ALLOC, @@ -763,6 +771,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, proc->pid, size, buffer); buffer->data_size = data_size; buffer->offsets_size = offsets_size; + buffer->extra_buffers_size = extra_buffers_size; buffer->async_transaction = is_async; if (is_async) { proc->free_async_space -= size + sizeof(struct binder_buffer); @@ -837,7 +846,8 @@ static void binder_free_buf(struct binder_proc *proc, buffer_size = binder_buffer_size(proc, buffer); size = ALIGN(buffer->data_size, sizeof(void *)) + - ALIGN(buffer->offsets_size, sizeof(void *)); + ALIGN(buffer->offsets_size, sizeof(void *)) + + ALIGN(buffer->extra_buffers_size, sizeof(void *)); binder_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_free_buf %p size %zd buffer_size %zd\n", @@ -1555,7 +1565,8 @@ err_fd_not_accepted: static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, - struct binder_transaction_data *tr, int reply) + struct binder_transaction_data *tr, int reply, + binder_size_t extra_buffers_size) { int ret; struct binder_transaction *t; @@ -1699,20 +1710,22 @@ static void binder_transaction(struct binder_proc *proc, if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld\n", + "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_thread->pid, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, - (u64)tr->data_size, (u64)tr->offsets_size); + 
(u64)tr->data_size, (u64)tr->offsets_size, + (u64)extra_buffers_size); else binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld\n", + "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_node->debug_id, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, - (u64)tr->data_size, (u64)tr->offsets_size); + (u64)tr->data_size, (u64)tr->offsets_size, + (u64)extra_buffers_size); if (!reply && !(tr->flags & TF_ONE_WAY)) t->from = thread; @@ -1728,7 +1741,8 @@ static void binder_transaction(struct binder_proc *proc, trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_buf(target_proc, tr->data_size, - tr->offsets_size, !reply && (t->flags & TF_ONE_WAY)); + tr->offsets_size, extra_buffers_size, + !reply && (t->flags & TF_ONE_WAY)); if (t->buffer == NULL) { return_error = BR_FAILED_REPLY; goto err_binder_alloc_buf_failed; @@ -2078,7 +2092,8 @@ static int binder_thread_write(struct binder_proc *proc, if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); - binder_transaction(proc, thread, &tr, cmd == BC_REPLY); + binder_transaction(proc, thread, &tr, + cmd == BC_REPLY, 0); break; } From dd9bc4f9f144a89d52a197fcf0ea033a287b7095 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 14:10:07 +0200 Subject: [PATCH 0529/3220] android: binder: support for scatter-gather. Previously all data passed over binder needed to be serialized, with the exception of Binder objects and file descriptors. This patch adds support for scatter-gathering raw memory buffers into a binder transaction, avoiding the need to first serialize them into a Parcel. To remain backwards compatible with existing binder clients, it introduces two new command ioctls for this purpose - BC_TRANSACTION_SG and BC_REPLY_SG. These commands may only be used with the new binder_transaction_data_sg structure, which adds a field for the total size of the buffers we are scatter-gathering. Because memory buffers may contain pointers to other buffers, we allow callers to specify a parent buffer and an offset into it, to indicate this is a location pointing to the buffer that we are fixing up. The kernel will then take care of fixing up the pointer to that buffer as well.
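For illustration, a userspace client could drive the new command roughly as follows. This is a hypothetical sketch: the struct, field and command names come from the uapi additions in this patch, while the helper function and its parameters are invented for the example.

	#include <stdint.h>
	#include <string.h>
	#include <linux/android/binder.h>

	/* Describe one raw buffer to be scatter-gathered into a transaction. */
	static void fill_sg_request(struct binder_transaction_data_sg *tr_sg,
				    struct binder_buffer_object *bo,
				    const void *raw, binder_size_t raw_len)
	{
		memset(bo, 0, sizeof(*bo));
		bo->hdr.type = BINDER_TYPE_PTR;
		bo->buffer = (binder_uintptr_t)(uintptr_t)raw; /* copied, then fixed up, by the kernel */
		bo->length = raw_len;
		/* flags/parent stay 0: no other buffer points into this one */

		/* total scatter-gather space; the driver requires 8-byte alignment */
		tr_sg->buffers_size = (raw_len + 7) & ~((binder_size_t)7);
		/* tr_sg->transaction_data is filled exactly as for BC_TRANSACTION,
		 * with bo placed in the offsets/objects area; the pair is then
		 * written to the driver with the BC_TRANSACTION_SG command. */
	}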
Change-Id: I02417f28cff14688f2e1d6fcb959438fd96566cc Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 242 ++++++++++++++++++++++++++-- include/uapi/linux/android/binder.h | 45 ++++++ 2 files changed, 275 insertions(+), 12 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 83a6c1bbbad5..a0f4d29e5148 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -153,6 +153,9 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) +#define to_binder_buffer_object(hdr) \ + container_of(hdr, struct binder_buffer_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -166,7 +169,7 @@ enum binder_stat_types { struct binder_stats { int br[_IOC_NR(BR_FAILED_REPLY) + 1]; - int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1]; + int bc[_IOC_NR(BC_REPLY_SG) + 1]; int obj_created[BINDER_STAT_COUNT]; int obj_deleted[BINDER_STAT_COUNT]; }; @@ -1306,6 +1309,9 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) case BINDER_TYPE_FD: object_size = sizeof(struct binder_fd_object); break; + case BINDER_TYPE_PTR: + object_size = sizeof(struct binder_buffer_object); + break; default: return 0; } @@ -1316,11 +1322,111 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) return 0; } +/** + * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer. + * @b: binder_buffer containing the object + * @index: index in offset array at which the binder_buffer_object is + * located + * @start: points to the start of the offset array + * @num_valid: the number of valid offsets in the offset array + * + * Return: If @index is within the valid range of the offset array + * described by @start and @num_valid, and if there's a valid + * binder_buffer_object at the offset found in index @index + * of the offset array, that object is returned. Otherwise, + * %NULL is returned. + * Note that the offset found in index @index itself is not + * verified; this function assumes that @num_valid elements + * from @start were previously verified to have valid offsets. + */ +static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b, + binder_size_t index, + binder_size_t *start, + binder_size_t num_valid) +{ + struct binder_buffer_object *buffer_obj; + binder_size_t *offp; + + if (index >= num_valid) + return NULL; + + offp = start + index; + buffer_obj = (struct binder_buffer_object *)(b->data + *offp); + if (buffer_obj->hdr.type != BINDER_TYPE_PTR) + return NULL; + + return buffer_obj; +} + +/** + * binder_validate_fixup() - validates pointer/fd fixups happen in order. + * @b: transaction buffer + * @objects_start start of objects buffer + * @buffer: binder_buffer_object in which to fix up + * @offset: start offset in @buffer to fix up + * @last_obj: last binder_buffer_object that we fixed up in + * @last_min_offset: minimum fixup offset in @last_obj + * + * Return: %true if a fixup in buffer @buffer at offset @offset is + * allowed. + * + * For safety reasons, we only allow fixups inside a buffer to happen + * at increasing offsets; additionally, we only allow fixup on the last + * buffer object that was verified, or one of its parents. 
+ * + * Example of what is allowed: + * + * A + * B (parent = A, offset = 0) + * C (parent = A, offset = 16) + * D (parent = C, offset = 0) + * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) + * + * Examples of what is not allowed: + * + * Decreasing offsets within the same parent: + * A + * C (parent = A, offset = 16) + * B (parent = A, offset = 0) // decreasing offset within A + * + * Referring to a parent that wasn't the last object or any of its parents: + * A + * B (parent = A, offset = 0) + * C (parent = A, offset = 0) + * C (parent = A, offset = 16) + * D (parent = B, offset = 0) // B is not A or any of A's parents + */ +static bool binder_validate_fixup(struct binder_buffer *b, + binder_size_t *objects_start, + struct binder_buffer_object *buffer, + binder_size_t fixup_offset, + struct binder_buffer_object *last_obj, + binder_size_t last_min_offset) +{ + if (!last_obj) { + /* Nothing to fix up in */ + return false; + } + + while (last_obj != buffer) { + /* + * Safe to retrieve the parent of last_obj, since it + * was already previously verified by the driver. + */ + if ((last_obj->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) + return false; + last_min_offset = last_obj->parent_offset + sizeof(uintptr_t); + last_obj = (struct binder_buffer_object *) + (b->data + *(objects_start + last_obj->parent)); + } + return (fixup_offset >= last_min_offset); +} + static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_buffer *buffer, binder_size_t *failed_at) { - binder_size_t *offp, *off_end; + binder_size_t *offp, *off_start, *off_end; int debug_id = buffer->debug_id; binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1331,13 +1437,13 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); - offp = (binder_size_t *)(buffer->data + - ALIGN(buffer->data_size, sizeof(void *))); + off_start = (binder_size_t *)(buffer->data + + ALIGN(buffer->data_size, sizeof(void *))); if (failed_at) off_end = failed_at; else - off_end = (void *)offp + buffer->offsets_size; - for (; offp < off_end; offp++) { + off_end = (void *)off_start + buffer->offsets_size; + for (offp = off_start; offp < off_end; offp++) { struct binder_object_header *hdr; size_t object_size = binder_validate_object(buffer, *offp); @@ -1393,7 +1499,12 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, if (failed_at) task_close_fd(proc, fp->fd); } break; - + case BINDER_TYPE_PTR: + /* + * Nothing to do here, this will get cleaned up when the + * transaction buffer gets freed + */ + break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); @@ -1563,6 +1674,53 @@ err_fd_not_accepted: return ret; } +static int binder_fixup_parent(struct binder_transaction *t, + struct binder_thread *thread, + struct binder_buffer_object *bp, + binder_size_t *off_start, + binder_size_t num_valid, + struct binder_buffer_object *last_fixup_obj, + binder_size_t last_fixup_min_off) +{ + struct binder_buffer_object *parent; + u8 *parent_buffer; + struct binder_buffer *b = t->buffer; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT)) + return 0; + + parent = binder_validate_ptr(b, bp->parent, off_start, num_valid); + if (!parent) { + binder_user_error("%d:%d got transaction with invalid parent offset or type\n", + proc->pid, thread->pid); + return -EINVAL; + } + + if (!binder_validate_fixup(b, 
off_start, + parent, bp->parent_offset, + last_fixup_obj, + last_fixup_min_off)) { + binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", + proc->pid, thread->pid); + return -EINVAL; + } + + if (parent->length < sizeof(binder_uintptr_t) || + bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) { + /* No space for a pointer here! */ + binder_user_error("%d:%d got transaction with invalid parent offset\n", + proc->pid, thread->pid); + return -EINVAL; + } + parent_buffer = (u8 *)(parent->buffer - + target_proc->user_buffer_offset); + *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; + + return 0; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -1571,8 +1729,9 @@ static void binder_transaction(struct binder_proc *proc, int ret; struct binder_transaction *t; struct binder_work *tcomplete; - binder_size_t *offp, *off_end; + binder_size_t *offp, *off_end, *off_start; binder_size_t off_min; + u8 *sg_bufp, *sg_buf_end; struct binder_proc *target_proc; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; @@ -1581,6 +1740,8 @@ static void binder_transaction(struct binder_proc *proc, struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error; + struct binder_buffer_object *last_fixup_obj = NULL; + binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; e = binder_transaction_log_add(&binder_transaction_log); @@ -1755,8 +1916,9 @@ static void binder_transaction(struct binder_proc *proc, if (target_node) binder_inc_node(target_node, 1, 0, NULL); - offp = (binder_size_t *)(t->buffer->data + - ALIGN(tr->data_size, sizeof(void *))); + off_start = (binder_size_t *)(t->buffer->data + + ALIGN(tr->data_size, sizeof(void *))); + offp = off_start; if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t) tr->data.ptr.buffer, tr->data_size)) { @@ -1778,7 +1940,16 @@ static void binder_transaction(struct binder_proc *proc, return_error = BR_FAILED_REPLY; goto err_bad_offset; } - off_end = (void *)offp + tr->offsets_size; + if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { + binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", + proc->pid, thread->pid, + extra_buffers_size); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + off_end = (void *)off_start + tr->offsets_size; + sg_bufp = (u8 *)(PTR_ALIGN(off_end, sizeof(void *))); + sg_buf_end = sg_bufp + extra_buffers_size; off_min = 0; for (; offp < off_end; offp++) { struct binder_object_header *hdr; @@ -1831,7 +2002,41 @@ static void binder_transaction(struct binder_proc *proc, fp->pad_binder = 0; fp->fd = target_fd; } break; + case BINDER_TYPE_PTR: { + struct binder_buffer_object *bp = + to_binder_buffer_object(hdr); + size_t buf_left = sg_buf_end - sg_bufp; + if (bp->length > buf_left) { + binder_user_error("%d:%d got transaction with too large buffer\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + if (copy_from_user(sg_bufp, + (const void __user *)(uintptr_t) + bp->buffer, bp->length)) { + binder_user_error("%d:%d got transaction with invalid offsets ptr\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_copy_data_failed; + } + /* Fixup buffer pointer to target proc address space */ + bp->buffer = (uintptr_t)sg_bufp + + target_proc->user_buffer_offset; + sg_bufp += ALIGN(bp->length, 
sizeof(u64)); + + ret = binder_fixup_parent(t, thread, bp, off_start, + offp - off_start, + last_fixup_obj, + last_fixup_min_off); + if (ret < 0) { + return_error = BR_FAILED_REPLY; + goto err_translate_failed; + } + last_fixup_obj = bp; + last_fixup_min_off = 0; + } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); @@ -2085,6 +2290,17 @@ static int binder_thread_write(struct binder_proc *proc, break; } + case BC_TRANSACTION_SG: + case BC_REPLY_SG: { + struct binder_transaction_data_sg tr; + + if (copy_from_user(&tr, ptr, sizeof(tr))) + return -EFAULT; + ptr += sizeof(tr); + binder_transaction(proc, thread, &tr.transaction_data, + cmd == BC_REPLY_SG, tr.buffers_size); + break; + } case BC_TRANSACTION: case BC_REPLY: { struct binder_transaction_data tr; @@ -3606,7 +3822,9 @@ static const char * const binder_command_strings[] = { "BC_EXIT_LOOPER", "BC_REQUEST_DEATH_NOTIFICATION", "BC_CLEAR_DEATH_NOTIFICATION", - "BC_DEAD_BINDER_DONE" + "BC_DEAD_BINDER_DONE", + "BC_TRANSACTION_SG", + "BC_REPLY_SG", }; static const char * const binder_objstat_strings[] = { diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index f67c2b1c0713..f3ef6e2634ba 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -33,6 +33,7 @@ enum { BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), + BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; enum { @@ -95,6 +96,39 @@ struct binder_fd_object { binder_uintptr_t cookie; }; + +/* struct binder_buffer_object - object describing a userspace buffer + * @hdr: common header structure + * @flags: one or more BINDER_BUFFER_* flags + * @buffer: address of the buffer + * @length: length of the buffer + * @parent: index in offset array pointing to parent buffer + * @parent_offset: offset in @parent pointing to this buffer + * + * A binder_buffer object represents an object that the + * binder kernel driver can copy verbatim to the target + * address space. A buffer itself may be pointed to from + * within another buffer, meaning that the pointer inside + * that other buffer needs to be fixed up as well. This + * can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT + * flag in @flags, by setting @parent buffer to the index + * in the offset array pointing to the parent binder_buffer_object, + * and by setting @parent_offset to the offset in the parent buffer + * at which the pointer to this buffer is located. + */ +struct binder_buffer_object { + struct binder_object_header hdr; + __u32 flags; + binder_uintptr_t buffer; + binder_size_t length; + binder_size_t parent; + binder_size_t parent_offset; +}; + +enum { + BINDER_BUFFER_FLAG_HAS_PARENT = 0x01, +}; + /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. 
@@ -187,6 +221,11 @@ struct binder_transaction_data { } data; }; +struct binder_transaction_data_sg { + struct binder_transaction_data transaction_data; + binder_size_t buffers_size; +}; + struct binder_ptr_cookie { binder_uintptr_t ptr; binder_uintptr_t cookie; @@ -371,6 +410,12 @@ enum binder_driver_command_protocol { /* * void *: cookie */ + + BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg), + BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg), + /* + * binder_transaction_data_sg: the sent command. + */ }; #endif /* _UAPI_LINUX_BINDER_H */ From e124de382771254e09bb2bbd2ac753cdae71a36d Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 18 Oct 2016 13:58:55 +0200 Subject: [PATCH 0530/3220] android: binder: support for file-descriptor arrays. This patch introduces a new binder_fd_array object, that allows us to support one or more file descriptors embedded in a buffer that is scatter-gathered. Change-Id: I647a53cf0d905c7be0dfd9333806982def68dd74 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 137 ++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 28 ++++++ 2 files changed, 165 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a0f4d29e5148..951393825261 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -156,6 +156,9 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_buffer_object(hdr) \ container_of(hdr, struct binder_buffer_object, hdr) +#define to_binder_fd_array_object(hdr) \ + container_of(hdr, struct binder_fd_array_object, hdr) + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -1312,6 +1315,9 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) case BINDER_TYPE_PTR: object_size = sizeof(struct binder_buffer_object); break; + case BINDER_TYPE_FDA: + object_size = sizeof(struct binder_fd_array_object); + break; default: return 0; } @@ -1505,6 +1511,47 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, * transaction buffer gets freed */ break; + case BINDER_TYPE_FDA: { + struct binder_fd_array_object *fda; + struct binder_buffer_object *parent; + uintptr_t parent_buffer; + u32 *fd_array; + size_t fd_index; + binder_size_t fd_buf_size; + + fda = to_binder_fd_array_object(hdr); + parent = binder_validate_ptr(buffer, fda->parent, + off_start, + offp - off_start); + if (!parent) { + pr_err("transaction release %d bad parent offset", + debug_id); + continue; + } + /* + * Since the parent was already fixed up, convert it + * back to kernel address space to access it + */ + parent_buffer = parent->buffer - + proc->user_buffer_offset; + + fd_buf_size = sizeof(u32) * fda->num_fds; + if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { + pr_err("transaction release %d invalid number of fds (%lld)\n", + debug_id, (u64)fda->num_fds); + continue; + } + if (fd_buf_size > parent->length || + fda->parent_offset > parent->length - fd_buf_size) { + /* No space for all file descriptors here. 
*/ + pr_err("transaction release %d not enough space for %lld fds in buffer\n", + debug_id, (u64)fda->num_fds); + continue; + } + fd_array = (u32 *)(parent_buffer + fda->parent_offset); + for (fd_index = 0; fd_index < fda->num_fds; fd_index++) + task_close_fd(proc, fd_array[fd_index]); + } break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); @@ -1674,6 +1721,63 @@ err_fd_not_accepted: return ret; } +static int binder_translate_fd_array(struct binder_fd_array_object *fda, + struct binder_buffer_object *parent, + struct binder_transaction *t, + struct binder_thread *thread, + struct binder_transaction *in_reply_to) +{ + binder_size_t fdi, fd_buf_size, num_installed_fds; + int target_fd; + uintptr_t parent_buffer; + u32 *fd_array; + struct binder_proc *proc = thread->proc; + struct binder_proc *target_proc = t->to_proc; + + fd_buf_size = sizeof(u32) * fda->num_fds; + if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { + binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n", + proc->pid, thread->pid, (u64)fda->num_fds); + return -EINVAL; + } + if (fd_buf_size > parent->length || + fda->parent_offset > parent->length - fd_buf_size) { + /* No space for all file descriptors here. */ + binder_user_error("%d:%d not enough space to store %lld fds in buffer\n", + proc->pid, thread->pid, (u64)fda->num_fds); + return -EINVAL; + } + /* + * Since the parent was already fixed up, convert it + * back to the kernel address space to access it + */ + parent_buffer = parent->buffer - target_proc->user_buffer_offset; + fd_array = (u32 *)(parent_buffer + fda->parent_offset); + if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { + binder_user_error("%d:%d parent offset not aligned correctly.\n", + proc->pid, thread->pid); + return -EINVAL; + } + for (fdi = 0; fdi < fda->num_fds; fdi++) { + target_fd = binder_translate_fd(fd_array[fdi], t, thread, + in_reply_to); + if (target_fd < 0) + goto err_translate_fd_failed; + fd_array[fdi] = target_fd; + } + return 0; + +err_translate_fd_failed: + /* + * Failed to allocate fd or security error, free fds + * installed so far. 
+ */ + num_installed_fds = fdi; + for (fdi = 0; fdi < num_installed_fds; fdi++) + task_close_fd(target_proc, fd_array[fdi]); + return target_fd; +} + static int binder_fixup_parent(struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, @@ -2002,6 +2106,38 @@ static void binder_transaction(struct binder_proc *proc, fp->pad_binder = 0; fp->fd = target_fd; } break; + case BINDER_TYPE_FDA: { + struct binder_fd_array_object *fda = + to_binder_fd_array_object(hdr); + struct binder_buffer_object *parent = + binder_validate_ptr(t->buffer, fda->parent, + off_start, + offp - off_start); + if (!parent) { + binder_user_error("%d:%d got transaction with invalid parent offset or type\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_parent; + } + if (!binder_validate_fixup(t->buffer, off_start, + parent, fda->parent_offset, + last_fixup_obj, + last_fixup_min_off)) { + binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_bad_parent; + } + ret = binder_translate_fd_array(fda, parent, t, thread, + in_reply_to); + if (ret < 0) { + return_error = BR_FAILED_REPLY; + goto err_translate_failed; + } + last_fixup_obj = parent; + last_fixup_min_off = + fda->parent_offset + sizeof(u32) * fda->num_fds; + } break; case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); @@ -2072,6 +2208,7 @@ static void binder_transaction(struct binder_proc *proc, err_translate_failed: err_bad_object_type: err_bad_offset: +err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index f3ef6e2634ba..51f891fb1b18 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -33,6 +33,7 @@ enum { BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), + BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE), BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; @@ -129,6 +130,33 @@ enum { BINDER_BUFFER_FLAG_HAS_PARENT = 0x01, }; +/* struct binder_fd_array_object - object describing an array of fds in a buffer + * @hdr: common header structure + * @num_fds: number of file descriptors in the buffer + * @parent: index in offset array to buffer holding the fd array + * @parent_offset: start offset of fd array in the buffer + * + * A binder_fd_array object represents an array of file + * descriptors embedded in a binder_buffer_object. It is + * different from a regular binder_buffer_object because it + * describes a list of file descriptors to fix up, not an opaque + * blob of memory, and hence the kernel needs to treat it differently. + * + * An example of how this would be used is with Android's + * native_handle_t object, which is a struct with a list of integers + * and a list of file descriptors. The native_handle_t struct itself + * will be represented by a struct binder_buffer_object, whereas the + * embedded list of file descriptors is represented by a + * struct binder_fd_array_object with that binder_buffer_object as
+ */ +struct binder_fd_array_object { + struct binder_object_header hdr; + binder_size_t num_fds; + binder_size_t parent; + binder_size_t parent_offset; +}; + /* * On 64-bit platforms where user code may run in 32-bits the driver must * translate the buffer (and local binder) addresses appropriately. From 96009ab737c4e6da44a0fff72d33534583454d03 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 15 Sep 2016 16:05:40 +0530 Subject: [PATCH 0531/3220] ANDROID: usb: gadget: audio_source: fix comparison of distinct pointer types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use div_s64() instead of do_div() to fix the following "comparison of distinct pointer types lacks a cast" warning in do_div() call in audio_send() for ARCH=arm in Linux 4.8-rc6: CC drivers/usb/gadget/function/f_audio_source.o In file included from ./arch/arm/include/asm/div64.h:126:0, from ./include/linux/kernel.h:142, from ./include/linux/list.h:8, from ./include/linux/kobject.h:20, from ./include/linux/device.h:17, from drivers/usb/gadget/function/f_audio_source.c:17: drivers/usb/gadget/function/f_audio_source.c: In function ‘audio_send’: ./include/asm-generic/div64.h:207:28: warning: comparison of distinct pointer types lacks a cast (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ ^ drivers/usb/gadget/function/f_audio_source.c:381:2: note: in expansion of macro ‘do_div’ do_div(msecs, 1000000); ^ ./include/asm-generic/div64.h:207:28: warning: comparison of distinct pointer types lacks a cast (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ ^ drivers/usb/gadget/function/f_audio_source.c:383:2: note: in expansion of macro ‘do_div’ do_div(frames, 1000); ^ LD drivers/usb/gadget/function/usb_f_audio_source.o Change-Id: Ie1a920c8948f3fc3f1263add25a402ded132fd66 Signed-off-by: Amit Pundir --- drivers/usb/gadget/function/f_audio_source.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index bcd817439dbf..2489a5fa2685 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -377,10 +377,9 @@ static void audio_send(struct audio_dev *audio) /* compute number of frames to send */ now = ktime_get(); - msecs = ktime_to_ns(now) - ktime_to_ns(audio->start_time); - do_div(msecs, 1000000); - frames = msecs * SAMPLE_RATE; - do_div(frames, 1000); + msecs = div_s64((ktime_to_ns(now) - ktime_to_ns(audio->start_time)), + 1000000); + frames = div_s64((msecs * SAMPLE_RATE), 1000); /* Readjust our frames_sent if we fall too far behind. * If we get too far behind it is better to drop some frames than From 2182c4daf185176ac99bf71a4ae1786ea9620714 Mon Sep 17 00:00:00 2001 From: Jonathan Hamilton Date: Wed, 21 Sep 2016 12:40:51 -0700 Subject: [PATCH 0532/3220] ANDROID: video: adf: Avoid directly referencing user pointers Enabling KASAN on a kernel using ADF flags a number of places where user-supplied pointers passed to ioctls are directly dereferenced without copy_from_user or access_ok.
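The fix follows one pattern throughout (a condensed sketch of what the diff below does in adf_device_post_config() and the other handlers; error paths elided):

	struct adf_post_config data;

	/* snapshot the user-supplied struct once... */
	if (copy_from_user(&data, arg, sizeof(data)))
		return -EFAULT;

	/* ...then operate only on the kernel copy; the user pointers stored
	 * inside it (data.custom_data, data.bufs, data.interfaces, ...) are
	 * still only ever passed to copy_from_user()/copy_to_user()/get_user(),
	 * never dereferenced directly. */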
Bug: 31806036 Signed-off-by: Jonathan Hamilton Change-Id: I6e86237aaa6cec0f6e1c385336aefcc5332080ae --- drivers/video/adf/adf_fops.c | 73 +++++++++++++++--------------------- 1 file changed, 31 insertions(+), 42 deletions(-) diff --git a/drivers/video/adf/adf_fops.c b/drivers/video/adf/adf_fops.c index 8726617f73ab..705411bfaebb 100644 --- a/drivers/video/adf/adf_fops.c +++ b/drivers/video/adf/adf_fops.c @@ -132,7 +132,7 @@ static int adf_eng_get_data(struct adf_overlay_engine *eng, eng->ops->n_supported_formats)); mutex_lock(&dev->client_lock); - ret = adf_obj_copy_custom_data_to_user(&eng->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&eng->base, data.custom_data, &data.custom_data_size); mutex_unlock(&dev->client_lock); @@ -144,7 +144,7 @@ static int adf_eng_get_data(struct adf_overlay_engine *eng, goto done; } - if (supported_formats && copy_to_user(arg->supported_formats, + if (supported_formats && copy_to_user(data.supported_formats, supported_formats, n_supported_formats * sizeof(supported_formats[0]))) ret = -EFAULT; @@ -220,56 +220,45 @@ static int adf_device_post_config(struct adf_device *dev, int complete_fence_fd; struct adf_buffer *bufs = NULL; struct adf_interface **intfs = NULL; - size_t n_intfs, n_bufs, i; + struct adf_post_config data; + size_t i; void *custom_data = NULL; - size_t custom_data_size; int ret = 0; + if (copy_from_user(&data, arg, sizeof(data))) + return -EFAULT; + complete_fence_fd = get_unused_fd_flags(O_CLOEXEC); if (complete_fence_fd < 0) return complete_fence_fd; - if (get_user(n_intfs, &arg->n_interfaces)) { - ret = -EFAULT; - goto err_get_user; - } - - if (n_intfs > ADF_MAX_INTERFACES) { + if (data.n_interfaces > ADF_MAX_INTERFACES) { ret = -EINVAL; goto err_get_user; } - if (get_user(n_bufs, &arg->n_bufs)) { - ret = -EFAULT; - goto err_get_user; - } - - if (n_bufs > ADF_MAX_BUFFERS) { + if (data.n_bufs > ADF_MAX_BUFFERS) { ret = -EINVAL; goto err_get_user; } - if (get_user(custom_data_size, &arg->custom_data_size)) { - ret = -EFAULT; - goto err_get_user; - } - - if (custom_data_size > ADF_MAX_CUSTOM_DATA_SIZE) { + if (data.custom_data_size > ADF_MAX_CUSTOM_DATA_SIZE) { ret = -EINVAL; goto err_get_user; } - if (n_intfs) { - intfs = kmalloc(sizeof(intfs[0]) * n_intfs, GFP_KERNEL); + if (data.n_interfaces) { + intfs = kmalloc(sizeof(intfs[0]) * data.n_interfaces, + GFP_KERNEL); if (!intfs) { ret = -ENOMEM; goto err_get_user; } } - for (i = 0; i < n_intfs; i++) { + for (i = 0; i < data.n_interfaces; i++) { u32 intf_id; - if (get_user(intf_id, &arg->interfaces[i])) { + if (get_user(intf_id, &data.interfaces[i])) { ret = -EFAULT; goto err_get_user; } @@ -281,31 +270,31 @@ static int adf_device_post_config(struct adf_device *dev, } } - if (n_bufs) { - bufs = kzalloc(sizeof(bufs[0]) * n_bufs, GFP_KERNEL); + if (data.n_bufs) { + bufs = kzalloc(sizeof(bufs[0]) * data.n_bufs, GFP_KERNEL); if (!bufs) { ret = -ENOMEM; goto err_get_user; } } - for (i = 0; i < n_bufs; i++) { - ret = adf_buffer_import(dev, &arg->bufs[i], &bufs[i]); + for (i = 0; i < data.n_bufs; i++) { + ret = adf_buffer_import(dev, &data.bufs[i], &bufs[i]); if (ret < 0) { memset(&bufs[i], 0, sizeof(bufs[i])); goto err_import; } } - if (custom_data_size) { - custom_data = kzalloc(custom_data_size, GFP_KERNEL); + if (data.custom_data_size) { + custom_data = kzalloc(data.custom_data_size, GFP_KERNEL); if (!custom_data) { ret = -ENOMEM; goto err_import; } - if (copy_from_user(custom_data, arg->custom_data, - custom_data_size)) { + if (copy_from_user(custom_data, data.custom_data, + 
data.custom_data_size)) { ret = -EFAULT; goto err_import; } @@ -316,8 +305,8 @@ static int adf_device_post_config(struct adf_device *dev, goto err_import; } - complete_fence = adf_device_post_nocopy(dev, intfs, n_intfs, bufs, - n_bufs, custom_data, custom_data_size); + complete_fence = adf_device_post_nocopy(dev, intfs, data.n_interfaces, + bufs, data.n_bufs, custom_data, data.custom_data_size); if (IS_ERR(complete_fence)) { ret = PTR_ERR(complete_fence); goto err_import; @@ -327,7 +316,7 @@ static int adf_device_post_config(struct adf_device *dev, return 0; err_import: - for (i = 0; i < n_bufs; i++) + for (i = 0; i < data.n_bufs; i++) adf_buffer_cleanup(&bufs[i]); err_get_user: @@ -481,19 +470,19 @@ static int adf_device_get_data(struct adf_device *dev, data.n_allowed_attachments); mutex_lock(&dev->client_lock); - ret = adf_obj_copy_custom_data_to_user(&dev->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&dev->base, data.custom_data, &data.custom_data_size); mutex_unlock(&dev->client_lock); if (ret < 0) goto done; - ret = adf_copy_attachment_list_to_user(arg->attachments, + ret = adf_copy_attachment_list_to_user(data.attachments, data.n_attachments, attach, n_attach); if (ret < 0) goto done; - ret = adf_copy_attachment_list_to_user(arg->allowed_attachments, + ret = adf_copy_attachment_list_to_user(data.allowed_attachments, data.n_allowed_attachments, allowed_attach, n_allowed_attach); if (ret < 0) @@ -592,7 +581,7 @@ static int adf_intf_get_data(struct adf_interface *intf, data.n_available_modes = intf->n_modes; read_unlock_irqrestore(&intf->hotplug_modelist_lock, flags); - if (copy_to_user(arg->available_modes, modelist, modelist_size)) { + if (copy_to_user(data.available_modes, modelist, modelist_size)) { ret = -EFAULT; goto done; } @@ -601,7 +590,7 @@ static int adf_intf_get_data(struct adf_interface *intf, memcpy(&data.current_mode, &intf->current_mode, sizeof(intf->current_mode)); - ret = adf_obj_copy_custom_data_to_user(&intf->base, arg->custom_data, + ret = adf_obj_copy_custom_data_to_user(&intf->base, data.custom_data, &data.custom_data_size); done: mutex_unlock(&dev->client_lock); From 0a60e50f0b910612c3f9560fc3a791e70d146b0b Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 11 Aug 2015 12:34:45 +0530 Subject: [PATCH 0533/3220] usb: gadget: f_mtp: simplify ptp NULL pointer check Simplify MTP/PTP dev NULL pointer check introduced in Change-Id: Ic44a699d96df2e13467fc081bff88b97dcc5afb2 and restrict it to MTP/PTP function level only. Return ERR_PTR() instead of NULL from mtp_ptp function to skip doing NULL pointer checks all the way up to configfs.c Fixes: Change-Id: Ic44a699d96df2e13467fc081bff88b97dcc5afb2 ("usb: gadget: fix NULL ptr derefer while symlinking PTP func") Change-Id: Iab7c55089c115550c3506f6cca960a07ae52713d Signed-off-by: Amit Pundir --- drivers/usb/gadget/configfs.c | 5 ----- drivers/usb/gadget/function/f_mtp.c | 2 +- drivers/usb/gadget/functions.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 8df96cb3bb58..54849fe9cb01 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -425,11 +425,6 @@ static int config_usb_cfg_link( } f = usb_get_function(fi); - if (f == NULL) { - /* Are we trying to symlink PTP without MTP function? 
*/ - ret = -EINVAL; /* Invalid Configuration */ - goto out; - } if (IS_ERR(f)) { ret = PTR_ERR(f); goto out; } diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 74195be9b054..b9e3e97c57c4 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -1498,7 +1498,7 @@ struct usb_function *function_alloc_mtp_ptp(struct usb_function_instance *fi, pr_err("\t2: Create MTP function\n"); pr_err("\t3: Create and symlink PTP function" " with a gadget configuration\n"); - return NULL; + return ERR_PTR(-EINVAL); /* Invalid Configuration */ } dev = fi_mtp->dev; diff --git a/drivers/usb/gadget/functions.c b/drivers/usb/gadget/functions.c index 389c1f3d0fee..b13f839e7368 100644 --- a/drivers/usb/gadget/functions.c +++ b/drivers/usb/gadget/functions.c @@ -58,7 +58,7 @@ struct usb_function *usb_get_function(struct usb_function_instance *fi) struct usb_function *f; f = fi->fd->alloc_func(fi); - if ((f == NULL) || IS_ERR(f)) + if (IS_ERR(f)) return f; f->fi = fi; return f; From ab2bff9a8f87be3710edef3f2f17dee78962a6a1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 13 May 2016 09:34:12 -0400 Subject: [PATCH 0534/3220] UPSTREAM: ring-buffer: Prevent overflow of size in ring_buffer_resize() (Cherry picked from commit 59643d1535eb220668692a5359de22545af579f6) If the size passed to ring_buffer_resize() is greater than MAX_LONG - BUF_PAGE_SIZE then the DIV_ROUND_UP() will return zero. Here are the details: # echo 18014398509481980 > /sys/kernel/debug/tracing/buffer_size_kb tracing_entries_write() processes this and converts kb to bytes. 18014398509481980 << 10 = 18446744073709547520 and this is passed to ring_buffer_resize() as unsigned long size. size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); Where DIV_ROUND_UP(a, b) is (a + b - 1)/b BUF_PAGE_SIZE is 4080 and here 18446744073709547520 + 4080 - 1 = 18446744073709551599 where 18446744073709551599 is still smaller than 2^64 2^64 - 18446744073709551599 = 17 But now 18446744073709551599 / 4080 = 4521260802379792 and size = size * 4080 = 18446744073709551360 This is checked to make sure it's still greater than 2 * 4080, which it is. Then we convert to the number of buffer pages needed. nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE) but this time size is 18446744073709551360 and 2^64 - (18446744073709551360 + 4080 - 1) = -3823 Thus it overflows and the resulting number is less than 4080, which makes 3823 / 4080 = 0, and nr_pages is set to this. As we already checked against the minimum that nr_pages may be, this causes the logic to fail as well, and we crash the kernel. There's no reason to have the two DIV_ROUND_UP() calls (that's just a result of historical code changes); clean up the code and fix this bug.
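The wraparound is easy to reproduce outside the kernel. Below is a minimal userspace sketch of the same arithmetic (not kernel code; BUF_PAGE_SIZE and DIV_ROUND_UP are copied from the kernel definitions); it prints nr_pages = 0 on an LP64 machine:

  #include <stdio.h>

  #define BUF_PAGE_SIZE 4080UL
  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  int main(void)
  {
          /* 18014398509481980 kb converted to bytes, as tracing_entries_write() does */
          unsigned long size = 18014398509481980UL << 10;
          unsigned long nr_pages;

          /* first round-up survives: size + 4079 still fits in 64 bits */
          size = DIV_ROUND_UP(size, BUF_PAGE_SIZE) * BUF_PAGE_SIZE;

          /* second round-up wraps: size + 4079 exceeds 2^64 by 3823,
           * so the division sees 3823 and yields 0 */
          nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
          printf("nr_pages = %lu\n", nr_pages);
          return 0;
  }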
Cc: stable@vger.kernel.org # 3.5+ Fixes: 83f40318dab00 ("ring-buffer: Make removal of ring buffer pages atomic") Signed-off-by: Steven Rostedt Signed-off-by: Willy Tarreau Change-Id: I1147672317a3ad0fc995b1f32baaa050a7976ac4 Bug: 32659848 --- kernel/trace/ring_buffer.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c6045a27ba3..7fa26a57a847 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1665,14 +1665,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return size; - size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - size *= BUF_PAGE_SIZE; + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); /* we need a minimum of two pages */ - if (size < BUF_PAGE_SIZE * 2) - size = BUF_PAGE_SIZE * 2; + if (nr_pages < 2) + nr_pages = 2; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size = nr_pages * BUF_PAGE_SIZE; /* * Don't succeed if resizing is disabled, as a reader might be From 40ceb2c69964f8bde97d4ded4306508db16fd365 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 15 Nov 2016 19:25:40 -0800 Subject: [PATCH 0535/3220] usb: gadget: Fix compilation problem with tx_qlen field Change-Id: I38c4f4a850b0329fb4a06b2c7e45558e16d66151 Signed-off-by: Dmitry Shmidt --- drivers/usb/gadget/function/u_ether.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 930f58f0ba32..cc5210a87614 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -746,7 +746,7 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb, req->no_interrupt = (((dev->gadget->speed == USB_SPEED_HIGH || dev->gadget->speed == USB_SPEED_SUPER)) && !list_empty(&dev->tx_reqs)) - ? ((atomic_read(&dev->tx_qlen) % dev->qmult) != 0) + ? ((dev->tx_qlen % dev->qmult) != 0) : 0; retval = usb_ep_queue(in, req, GFP_ATOMIC); From 6dc8c51a76e57ae01fa087bdb5450423ce76a373 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 11:58:52 +0530 Subject: [PATCH 0536/3220] cpufreq: sched: Fix kernel crash on accessing sysfs file If the cpufreq driver hasn't set the CPUFREQ_HAVE_GOVERNOR_PER_POLICY flag, then the kernel will crash on accessing sysfs files for the sched governor. With CPUFreq governors, the governor-specific sysfs files can live in two places: A. /sys/devices/system/cpu/cpuX/cpufreq/ B. /sys/devices/system/cpu/cpufreq/ Case A is the governor-per-policy case, where we can control the governor tunables for each policy separately. Case B is for system-wide tunable values. The schedfreq governor only implements case A, not case B. The sysfs files for case B will still be present in /sys/devices/system/cpu/cpufreq/, but accessing them will crash the kernel, as the governor doesn't support that. Moreover, the sched governor is pretty new, will be used only on ARM platforms, and there is no need to support case B at all. Hence, use policy->kobj instead of get_governor_parent_kobj(), so that we always create the sysfs files in path A.
Signed-off-by: Viresh Kumar --- kernel/sched/cpufreq_sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f6f9b9b3a4a8..d751bc2d0d6e 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -289,7 +289,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) pr_debug("%s: throttle threshold = %u [ns]\n", __func__, gd->up_throttle_nsec); - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); if (rc) { pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); goto err; @@ -332,7 +332,7 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); + sysfs_remove_group(&policy->kobj, get_sysfs_attr()); policy->governor_data = NULL; From ff458c4fca9797d314a024bad05b59831eff3f58 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Fri, 11 Nov 2016 01:10:04 -0500 Subject: [PATCH 0537/3220] ANDROID: usb: gadget: function: cleanup: Add blank line after declaration Fix warning generated by checkpatch.pl: Missing a blank line after declarations Change-Id: Id129bb8cc8fa37c67a647e2e5996bb2817020e65 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 2 ++ drivers/usb/gadget/function/f_audio_source.c | 1 + drivers/usb/gadget/function/f_mtp.c | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 2ca16a577542..9d3ec0e37475 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -211,6 +211,7 @@ static inline struct acc_dev *func_to_dev(struct usb_function *f) static struct usb_request *acc_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; @@ -1021,6 +1022,7 @@ acc_function_unbind(struct usb_configuration *c, struct usb_function *f) static void acc_start_work(struct work_struct *data) { char *envp[2] = { "ACCESSORY=START", NULL }; + kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp); } diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index 2489a5fa2685..db7903d19c43 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -310,6 +310,7 @@ static struct device_attribute *audio_source_function_attributes[] = { static struct usb_request *audio_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index b9e3e97c57c4..d8b69af6e335 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -361,6 +361,7 @@ static inline struct mtp_dev *func_to_mtp(struct usb_function *f) static struct usb_request *mtp_request_new(struct usb_ep *ep, int buffer_size) { struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL); + if (!req) return NULL; @@ -1121,6 +1122,7 @@ static int mtp_ctrlrequest(struct usb_composite_dev *cdev, } else if (ctrl->bRequest == MTP_REQ_GET_DEVICE_STATUS && w_index == 0 && w_value == 0) { struct mtp_device_status *status = cdev->req->buf; + status->wLength = 
__constant_cpu_to_le16(sizeof(*status)); @@ -1143,6 +1145,7 @@ static int mtp_ctrlrequest(struct usb_composite_dev *cdev, /* respond with data transfer or status phase? */ if (value >= 0) { int rc; + cdev->req->zero = value < w_length; cdev->req->length = value; rc = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC); @@ -1378,6 +1381,7 @@ static struct mtp_instance *to_mtp_instance(struct config_item *item) static void mtp_attr_release(struct config_item *item) { struct mtp_instance *fi_mtp = to_mtp_instance(item); + usb_put_function_instance(&fi_mtp->func_inst); } From 1a8b993d92021b4f81da4b219a4214bdec06f541 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:00:03 +0000 Subject: [PATCH 0538/3220] BACKPORT: staging: goldfish: audio: add devicetree bindings Introduce devicetree bindings to the Goldfish staging audio driver. Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 283ded10312a3b75e384313f6f529ec2c636cf2c) Change-Id: Ib75d3a4cac7353084a8da18a96fb298a759bacc0 --- .../devicetree/bindings/goldfish/audio.txt | 17 +++++++++++++++++ drivers/staging/goldfish/goldfish_audio.c | 9 ++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/audio.txt diff --git a/Documentation/devicetree/bindings/goldfish/audio.txt b/Documentation/devicetree/bindings/goldfish/audio.txt new file mode 100644 index 000000000000..d043fda433ba --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/audio.txt @@ -0,0 +1,17 @@ +Android Goldfish Audio + +Android goldfish audio device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-audio" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_audio@9030000 { + compatible = "google,goldfish-audio"; + reg = <0x9030000 0x100>; + interrupts = <0x4>; + }; diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index b0927e49d0a8..8841680ced9f 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -344,11 +344,18 @@ static int goldfish_audio_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_audio_of_match[] = { + { .compatible = "google,goldfish-audio", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_audio_of_match); + static struct platform_driver goldfish_audio_driver = { .probe = goldfish_audio_probe, .remove = goldfish_audio_remove, .driver = { - .name = "goldfish_audio" + .name = "goldfish_audio", + .of_match_table = goldfish_audio_of_match, } }; From ac4d96f09a06c127d9d7509efbe6403a6b19d208 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 18:45:30 +0000 Subject: [PATCH 0539/3220] BACKPORT: power: goldfish_battery: add devicetree bindings Add device tree bindings to the Goldfish virtual platform battery drivers. 
Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Sebastian Reichel (cherry picked from commit 65d687a7b7d6f27e4306fe8cc8a1ca66a1a760f6) Change-Id: If947ea3341ff0cb713c56e14d18d51a3f5912b64 --- .../devicetree/bindings/goldfish/battery.txt | 17 +++++++++++++++++ drivers/power/goldfish_battery.c | 9 ++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/battery.txt diff --git a/Documentation/devicetree/bindings/goldfish/battery.txt b/Documentation/devicetree/bindings/goldfish/battery.txt new file mode 100644 index 000000000000..4fb613933214 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/battery.txt @@ -0,0 +1,17 @@ +Android Goldfish Battery + +Android goldfish battery device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-battery" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_battery@9020000 { + compatible = "google,goldfish-battery"; + reg = <0x9020000 0x1000>; + interrupts = <0x3>; + }; diff --git a/drivers/power/goldfish_battery.c b/drivers/power/goldfish_battery.c index a50bb988c69a..7510796e190c 100644 --- a/drivers/power/goldfish_battery.c +++ b/drivers/power/goldfish_battery.c @@ -227,11 +227,18 @@ static int goldfish_battery_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_battery_of_match[] = { + { .compatible = "google,goldfish-battery", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_battery_of_match); + static struct platform_driver goldfish_battery_device = { .probe = goldfish_battery_probe, .remove = goldfish_battery_remove, .driver = { - .name = "goldfish-battery" + .name = "goldfish-battery", + .of_match_table = goldfish_battery_of_match, } }; module_platform_driver(goldfish_battery_device); From aec62280ca9d6091b08c979284ae41379549d506 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 12:05:02 -0800 Subject: [PATCH 0540/3220] BACKPORT: Input: goldfish_events - add devicetree bindings Add device tree bindings to the Goldfish virtual platform event driver. Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Dmitry Torokhov (cherry picked from commit 8c5dc5a1ada2b79259e55a4bd150135d23529c6a) Change-Id: I677d8e0d92294f53f7cc5a79300b6462b65e8aad --- .../devicetree/bindings/goldfish/events.txt | 17 +++++++++++++++++ drivers/input/keyboard/goldfish_events.c | 7 +++++++ 2 files changed, 24 insertions(+) create mode 100644 Documentation/devicetree/bindings/goldfish/events.txt diff --git a/Documentation/devicetree/bindings/goldfish/events.txt b/Documentation/devicetree/bindings/goldfish/events.txt new file mode 100644 index 000000000000..5babf46317a4 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/events.txt @@ -0,0 +1,17 @@ +Android Goldfish Events Keypad + +Android goldfish events keypad device generated by android emulator. 
+ +Required properties: + +- compatible : should contain "google,goldfish-events-keypad" to match emulator +- reg : +- interrupts : + +Example: + + goldfish-events@9040000 { + compatible = "google,goldfish-events-keypad"; + reg = <0x9040000 0x1000>; + interrupts = <0x5>; + }; diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index 907e4e278fce..b11d218604a7 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -178,10 +178,17 @@ static int events_probe(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_events_of_match[] = { + { .compatible = "google,goldfish-events-keypad", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_events_of_match); + static struct platform_driver events_driver = { .probe = events_probe, .driver = { .name = "goldfish_events", + .of_match_table = goldfish_events_of_match, }, }; From 3f5a380007fec7d4f6a0f361cf9da78a206e53c1 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:01:05 +0000 Subject: [PATCH 0541/3220] BACKPORT: tty: goldfish: support platform_device with id -1 When the platform bus sets the platform_device id to -1 (PLATFORM_DEVID_NONE), use an incrementing counter for the TTY index instead Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 465893e18878e119d8d0255439fad8debbd646fd) Change-Id: Ifec5ee9d71c7c076e59bb7af77c0184d1b1383cb --- drivers/tty/goldfish.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index 0f82c0b146f6..4356a23c588f 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -68,8 +68,7 @@ static void goldfish_tty_do_write(int line, const char *buf, unsigned count) static irqreturn_t goldfish_tty_interrupt(int irq, void *dev_id) { - struct platform_device *pdev = dev_id; - struct goldfish_tty *qtty = &goldfish_ttys[pdev->id]; + struct goldfish_tty *qtty = dev_id; void __iomem *base = qtty->base; unsigned long irq_flags; unsigned char *buf; @@ -233,6 +232,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) struct device *ttydev; void __iomem *base; u32 irq; + unsigned int line; r = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (r == NULL) @@ -248,10 +248,16 @@ static int goldfish_tty_probe(struct platform_device *pdev) irq = r->start; - if (pdev->id >= goldfish_tty_line_count) - goto err_unmap; - mutex_lock(&goldfish_tty_lock); + + if (pdev->id == PLATFORM_DEVID_NONE) + line = goldfish_tty_current_line_count; + else + line = pdev->id; + + if (line >= goldfish_tty_line_count) + goto err_create_driver_failed; + if (goldfish_tty_current_line_count == 0) { ret = goldfish_tty_create_driver(); if (ret) @@ -259,7 +265,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) } goldfish_tty_current_line_count++; - qtty = &goldfish_ttys[pdev->id]; + qtty = &goldfish_ttys[line]; spin_lock_init(&qtty->lock); tty_port_init(&qtty->port); qtty->port.ops = &goldfish_port_ops; @@ -269,13 +275,13 @@ static int goldfish_tty_probe(struct platform_device *pdev) writel(GOLDFISH_TTY_CMD_INT_DISABLE, base + GOLDFISH_TTY_CMD); ret = request_irq(irq, goldfish_tty_interrupt, IRQF_SHARED, - "goldfish_tty", pdev); + "goldfish_tty", qtty); if (ret) goto err_request_irq_failed; ttydev = tty_port_register_device(&qtty->port, goldfish_tty_driver, - pdev->id, &pdev->dev); + line, &pdev->dev); if 
(IS_ERR(ttydev)) { ret = PTR_ERR(ttydev); goto err_tty_register_device_failed; @@ -286,8 +292,9 @@ static int goldfish_tty_probe(struct platform_device *pdev) qtty->console.device = goldfish_tty_console_device; qtty->console.setup = goldfish_tty_console_setup; qtty->console.flags = CON_PRINTBUFFER; - qtty->console.index = pdev->id; + qtty->console.index = line; register_console(&qtty->console); + platform_set_drvdata(pdev, qtty); mutex_unlock(&goldfish_tty_lock); return 0; @@ -307,13 +314,12 @@ err_unmap: static int goldfish_tty_remove(struct platform_device *pdev) { - struct goldfish_tty *qtty; + struct goldfish_tty *qtty = platform_get_drvdata(pdev); mutex_lock(&goldfish_tty_lock); - qtty = &goldfish_ttys[pdev->id]; unregister_console(&qtty->console); - tty_unregister_device(goldfish_tty_driver, pdev->id); + tty_unregister_device(goldfish_tty_driver, qtty->console.index); iounmap(qtty->base); qtty->base = NULL; free_irq(qtty->irq, pdev); From b58abfa8bd400855bf09c84ae27dd2a4446c60f9 Mon Sep 17 00:00:00 2001 From: Miodrag Dinic Date: Fri, 26 Feb 2016 19:00:44 +0000 Subject: [PATCH 0542/3220] BACKPORT: drivers: tty: goldfish: Add device tree bindings Enable support for registering this device using the device tree. Device tree node example for registering Goldfish TTY device : goldfish_tty@1f004000 { interrupts = <0xc>; reg = <0x1f004000 0x1000>; compatible = "google,goldfish-tty"; }; Signed-off-by: Miodrag Dinic Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 9b883eea26ccf043b608e398cf6a26231d44f5fb) Change-Id: Idbe1bbac4f371e2feb6730712b08b66be1188ea7 --- .../devicetree/bindings/goldfish/tty.txt | 17 +++++++++++++++++ drivers/tty/goldfish.c | 10 +++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/goldfish/tty.txt diff --git a/Documentation/devicetree/bindings/goldfish/tty.txt b/Documentation/devicetree/bindings/goldfish/tty.txt new file mode 100644 index 000000000000..82648278da77 --- /dev/null +++ b/Documentation/devicetree/bindings/goldfish/tty.txt @@ -0,0 +1,17 @@ +Android Goldfish TTY + +Android goldfish tty device generated by android emulator. + +Required properties: + +- compatible : should contain "google,goldfish-tty" to match emulator +- reg : +- interrupts : + +Example: + + goldfish_tty@1f004000 { + compatible = "google,goldfish-tty"; + reg = <0x1f004000 0x1000>; + interrupts = <0xc>; + }; diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index 4356a23c588f..1e332855b933 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -330,11 +330,19 @@ static int goldfish_tty_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_tty_of_match[] = { + { .compatible = "google,goldfish-tty", }, + {}, +}; + +MODULE_DEVICE_TABLE(of, goldfish_tty_of_match); + static struct platform_driver goldfish_tty_platform_driver = { .probe = goldfish_tty_probe, .remove = goldfish_tty_remove, .driver = { - .name = "goldfish_tty" + .name = "goldfish_tty", + .of_match_table = goldfish_tty_of_match, } }; From 779a63ef878a4d8cd601e791fdf81df9fd5f953a Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Tue, 1 Mar 2016 23:46:10 +0000 Subject: [PATCH 0543/3220] BACKPORT: goldfish: Enable ACPI-based enumeration for goldfish battery Add the ACPI bindings to the goldfish battery driver. 
Signed-off-by: Yu Ning Signed-off-by: Jin Qian Signed-off-by: Alan Cox Signed-off-by: Sebastian Reichel (cherry picked from commit fdb2f37a54470473c6b7c9d680c4c114dd9bc434) Change-Id: I3b53481b5868b0b26848397420c9ba16a747819f --- drivers/power/goldfish_battery.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/power/goldfish_battery.c b/drivers/power/goldfish_battery.c index 7510796e190c..f5c525e4482a 100644 --- a/drivers/power/goldfish_battery.c +++ b/drivers/power/goldfish_battery.c @@ -24,6 +24,7 @@ #include #include #include +#include struct goldfish_battery_data { void __iomem *reg_base; @@ -233,12 +234,19 @@ static const struct of_device_id goldfish_battery_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_battery_of_match); +static const struct acpi_device_id goldfish_battery_acpi_match[] = { + { "GFSH0001", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_battery_acpi_match); + static struct platform_driver goldfish_battery_device = { .probe = goldfish_battery_probe, .remove = goldfish_battery_remove, .driver = { .name = "goldfish-battery", .of_match_table = goldfish_battery_of_match, + .acpi_match_table = ACPI_PTR(goldfish_battery_acpi_match), } }; module_platform_driver(goldfish_battery_device); From 212048596dbaa5e81386c9594782c4928ed6efa1 Mon Sep 17 00:00:00 2001 From: Jason Hu Date: Fri, 26 Feb 2016 12:06:47 -0800 Subject: [PATCH 0544/3220] BACKPORT: Input: goldfish_events - enable ACPI-based enumeration for goldfish events Add ACPI binding to the goldfish events driver. Signed-off-by: Jason Hu Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Dmitry Torokhov (cherry picked from commit 0581ce09fd2c976125a20791268d7206db156d2f) Change-Id: Ic3e4f1cffb111ea6c69977e63dd598e3fcb55f19 --- drivers/input/keyboard/goldfish_events.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index b11d218604a7..f6e643b589b6 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -22,6 +22,7 @@ #include #include #include +#include enum { REG_READ = 0x00, @@ -184,11 +185,20 @@ static const struct of_device_id goldfish_events_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_events_of_match); +#ifdef CONFIG_ACPI +static const struct acpi_device_id goldfish_events_acpi_match[] = { + { "GFSH0002", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_events_acpi_match); +#endif + static struct platform_driver events_driver = { .probe = events_probe, .driver = { .name = "goldfish_events", .of_match_table = goldfish_events_of_match, + .acpi_match_table = ACPI_PTR(goldfish_events_acpi_match), }, }; From f3d949fd57efc8d8a868517e0f64b77662388417 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 26 Feb 2016 19:00:18 +0000 Subject: [PATCH 0545/3220] BACKPORT: staging: goldfish: audio: fix compiliation on arm We do actually need slab.h, by luck we get it on other platforms but not always on ARM. Include it properly. 
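For context (an illustrative fragment, not part of the patch): linux/slab.h is where the slab allocator API is declared, so any file calling these helpers has to include it, directly or via another header, and on the configurations that broke nothing else pulled it in:

  #include <linux/slab.h>   /* declares kmalloc(), kzalloc(), kfree() */

  static void *example_buffer(size_t len)
  {
          /* without the include above, kzalloc() is undeclared here on
           * ARM configs where no other header drags slab.h in */
          return kzalloc(len, GFP_KERNEL);
  }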
Signed-off-by: Greg Hackmann Signed-off-by: Jin Qian Signed-off-by: Alan Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 4532150762ceb0d6fd765ebcb3ba6966fbb8faab) Change-Id: I93a0c35da40f26aaa7c253e3c0cefaa883ea3391 --- drivers/staging/goldfish/goldfish_audio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 8841680ced9f..4191054b878e 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -26,6 +26,7 @@ #include #include #include +#include #include MODULE_AUTHOR("Google, Inc."); From 95f38dcf1d106e62e97f207344978d6230a9c292 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Mon, 28 Oct 2013 15:33:33 -0700 Subject: [PATCH 0546/3220] ANDROID: video: goldfishfb: add devicetree bindings Change-Id: I5f4ba861b981edf39af537001f8ac72202927031 Signed-off-by: Greg Hackmann --- drivers/video/fbdev/goldfishfb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 7f6c9e6cfc6c..f0e651bbbd61 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -304,12 +304,19 @@ static int goldfish_fb_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id goldfish_fb_of_match[] = { + { .compatible = "google,goldfish-fb", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_fb_of_match); static struct platform_driver goldfish_fb_driver = { .probe = goldfish_fb_probe, .remove = goldfish_fb_remove, .driver = { - .name = "goldfish_fb" + .name = "goldfish_fb", + .owner = THIS_MODULE, + .of_match_table = goldfish_fb_of_match, } }; From 9cec6e315327318d99542f5ce695f62351cc2886 Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Thu, 12 Feb 2015 11:44:40 +0800 Subject: [PATCH 0547/3220] ANDROID: goldfish: Enable ACPI-based enumeration for goldfish framebuffer Follow the same way in which ACPI was enabled for goldfish battery. See commit d3be10e for details. Note that this patch also depends on commit af33cac. Change-Id: Ic63b6e7e0a4b9896ef9a9d0ed135a7796a4c1fdb Signed-off-by: Yu Ning --- drivers/video/fbdev/goldfishfb.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index f0e651bbbd61..58b33e4af16e 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -26,6 +26,7 @@ #include #include #include +#include enum { FB_GET_WIDTH = 0x00, @@ -310,6 +311,12 @@ static const struct of_device_id goldfish_fb_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_fb_of_match); +static const struct acpi_device_id goldfish_fb_acpi_match[] = { + { "GFSH0004", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_fb_acpi_match); + static struct platform_driver goldfish_fb_driver = { .probe = goldfish_fb_probe, .remove = goldfish_fb_remove, @@ -317,6 +324,7 @@ static struct platform_driver goldfish_fb_driver = { .name = "goldfish_fb", .owner = THIS_MODULE, .of_match_table = goldfish_fb_of_match, + .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match), } }; From a666500dd7205043280ed38c2bca81854b4067be Mon Sep 17 00:00:00 2001 From: Yu Ning Date: Tue, 31 Mar 2015 14:41:48 +0800 Subject: [PATCH 0548/3220] ANDROID: goldfish: Enable ACPI-based enumeration for goldfish audio Follow the same way in which ACPI was enabled for goldfish battery. See commit d3be10e for details. 
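An aside on the ACPI_PTR() wrapper these goldfish patches use: it compiles the match-table reference away when ACPI support is configured out. Its definition in include/linux/acpi.h boils down to:

  #ifdef CONFIG_ACPI
  #define ACPI_PTR(_ptr)  (_ptr)
  #else
  #define ACPI_PTR(_ptr)  (NULL)
  #endif

This lets a driver set .acpi_match_table unconditionally while guarding the table itself behind #ifdef CONFIG_ACPI, as the goldfish_events patch above does.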
Change-Id: I6ffe38ebc80fb8af8322152370b9d1fd227eaf50 Signed-off-by: Yu Ning --- drivers/staging/goldfish/goldfish_audio.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 4191054b878e..83bbe459b93a 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -28,6 +28,7 @@ #include #include #include +#include MODULE_AUTHOR("Google, Inc."); MODULE_DESCRIPTION("Android QEMU Audio Driver"); @@ -351,12 +352,19 @@ static const struct of_device_id goldfish_audio_of_match[] = { }; MODULE_DEVICE_TABLE(of, goldfish_audio_of_match); +static const struct acpi_device_id goldfish_audio_acpi_match[] = { + { "GFSH0005", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_audio_acpi_match); + static struct platform_driver goldfish_audio_driver = { .probe = goldfish_audio_probe, .remove = goldfish_audio_remove, .driver = { .name = "goldfish_audio", .of_match_table = goldfish_audio_of_match, + .acpi_match_table = ACPI_PTR(goldfish_audio_acpi_match), } }; From d5a5ff91f852f26d7261cb31cb126f267567af24 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 19 Jun 2014 16:24:04 +0200 Subject: [PATCH 0549/3220] ANDROID: goldfish_fb: Set pixclock = 0 User space Android code identifies pixclock == 0 as a sign for emulation and will set the frame rate to 60 fps when reading this value, which is the desired outcome. Change-Id: I759bf518bf6683446bc786bf1be3cafa02dd8d42 Signed-off-by: Christoffer Dall Signed-off-by: Peter Maydell --- drivers/video/fbdev/goldfishfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 58b33e4af16e..131fee0341c3 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -235,7 +235,7 @@ static int goldfish_fb_probe(struct platform_device *pdev) fb->fb.var.activate = FB_ACTIVATE_NOW; fb->fb.var.height = readl(fb->reg_base + FB_GET_PHYS_HEIGHT); fb->fb.var.width = readl(fb->reg_base + FB_GET_PHYS_WIDTH); - fb->fb.var.pixclock = 10000; + fb->fb.var.pixclock = 0; fb->fb.var.red.offset = 11; fb->fb.var.red.length = 5; From 8bf12bc1b78dac6cb4fb7e4fbc0920978d17f5ea Mon Sep 17 00:00:00 2001 From: Lingfeng Yang Date: Fri, 18 Dec 2015 12:04:43 -0800 Subject: [PATCH 0550/3220] ANDROID: goldfish_events: no extra EV_SYN; register goldfish If we send SYN_REPORT on every single multitouch event, it breaks multitouch: input becomes janky, taps have to be repeated 2-3 times to register, and notification bars are sometimes activated without being touched. If we suppress these extra SYN_REPORTs, multitouch works fine and the event stream follows the expected protocol. In addition, we need to register Goldfish Events as a multitouch device by issuing input_mt_init_slots, otherwise input_handle_abs_event in drivers/input/input.c will silently drop all ABS_MT_SLOT events, so touches with more than one finger do not work properly.
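To see why the extra SYN_REPORTs are harmful, compare with the canonical protocol-B frame for a two-finger touch from the kernel's multi-touch protocol documentation (coordinates are placeholders):

  ABS_MT_SLOT 0
  ABS_MT_TRACKING_ID 45
  ABS_MT_POSITION_X x[0]
  ABS_MT_POSITION_Y y[0]
  ABS_MT_SLOT 1
  ABS_MT_TRACKING_ID 46
  ABS_MT_POSITION_X x[1]
  ABS_MT_POSITION_Y y[1]
  SYN_REPORT

A single SYN_REPORT terminates the whole frame; emitting one after every event above would chop the frame into pieces and confuse userspace, which is exactly the jankiness described here.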
Signed-off-by: "Lingfeng Yang" Change-Id: Ib2350f7d1732449d246f6f0d9b7b08f02cc7c2dd (cherry picked from commit 6cf40d0a16330e1ef42bdf07d9aba6c16ee11fbc) --- drivers/input/keyboard/goldfish_events.c | 28 +++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index f6e643b589b6..c877e56a9bd5 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#define GOLDFISH_MAX_FINGERS 5 + enum { REG_READ = 0x00, REG_SET_PAGE = 0x00, @@ -52,7 +55,21 @@ static irqreturn_t events_interrupt(int irq, void *dev_id) value = __raw_readl(edev->addr + REG_READ); input_event(edev->input, type, code, value); - input_sync(edev->input); + // Send an extra (EV_SYN, SYN_REPORT, 0x0) event + // if a key was pressed. Some keyboard device + // drivers may only send the EV_KEY event and + // not EV_SYN. + // Note that sending an extra SYN_REPORT is not + // necessary nor correct protocol with other + // devices such as touchscreens, which will send + // their own SYN_REPORT's when sufficient event + // information has been collected (e.g., for + // touchscreens, when pressure and X/Y coordinates + // have been received). Hence, we will only send + // this extra SYN_REPORT if type == EV_KEY. + if (type == EV_KEY) { + input_sync(edev->input); + } return IRQ_HANDLED; } @@ -154,6 +171,15 @@ static int events_probe(struct platform_device *pdev) input_dev->name = edev->name; input_dev->id.bustype = BUS_HOST; + // Set the Goldfish Device to be multi-touch. + // In the Ranchu kernel, there is multi-touch-specific + // code for handling ABS_MT_SLOT events. + // See drivers/input/input.c:input_handle_abs_event. + // If we do not issue input_mt_init_slots, + // the kernel will filter out needed ABS_MT_SLOT + // events when we touch the screen in more than one place, + // preventing multi-touch with more than one finger from working. + input_mt_init_slots(input_dev, GOLDFISH_MAX_FINGERS, 0); events_import_bits(edev, input_dev->evbit, EV_SYN, EV_MAX); events_import_bits(edev, input_dev->keybit, EV_KEY, KEY_MAX); From 18e334986224af1dc9d1a61dd8957f07919c76d0 Mon Sep 17 00:00:00 2001 From: Joshua Lang Date: Fri, 17 Jun 2016 17:30:55 -0700 Subject: [PATCH 0551/3220] ANDROID: goldfish_audio: Clear audio read buffer status after each read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The buffer_status field is interrupt updated. After every read request, the buffer_status read field should be reset so that on the next loop iteration we don't read a stale value and read data before the device is ready. 
Signed-off-by: “Joshua Lang” Change-Id: I4943d5aaada1cad9c7e59a94a87c387578dabe86 --- drivers/staging/goldfish/goldfish_audio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c index 83bbe459b93a..63b79c09b41b 100644 --- a/drivers/staging/goldfish/goldfish_audio.c +++ b/drivers/staging/goldfish/goldfish_audio.c @@ -117,6 +117,7 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf, size_t count, loff_t *pos) { struct goldfish_audio *data = fp->private_data; + unsigned long irq_flags; int length; int result = 0; @@ -130,6 +131,10 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf, wait_event_interruptible(data->wait, data->buffer_status & AUDIO_INT_READ_BUFFER_FULL); + spin_lock_irqsave(&data->lock, irq_flags); + data->buffer_status &= ~AUDIO_INT_READ_BUFFER_FULL; + spin_unlock_irqrestore(&data->lock, irq_flags); + length = AUDIO_READ(data, AUDIO_READ_BUFFER_AVAILABLE); /* copy data to user space */ From 0c1529aecf8fb719cf49de4d5a452b855e437387 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Fri, 7 Oct 2016 16:20:47 -0700 Subject: [PATCH 0552/3220] ANDROID: goldfish: add ranchu defconfigs Change-Id: I73ef1b132b6203ae921a1e1d4850eaadf58f8926 --- arch/arm/configs/ranchu_defconfig | 315 +++++++++++++++++ arch/arm64/configs/ranchu_defconfig | 311 +++++++++++++++++ arch/x86/configs/i386_ranchu_defconfig | 422 +++++++++++++++++++++++ arch/x86/configs/x86_64_ranchu_defconfig | 417 ++++++++++++++++++++++ 4 files changed, 1465 insertions(+) create mode 100644 arch/arm/configs/ranchu_defconfig create mode 100644 arch/arm64/configs/ranchu_defconfig create mode 100644 arch/x86/configs/i386_ranchu_defconfig create mode 100644 arch/x86/configs/x86_64_ranchu_defconfig diff --git a/arch/arm/configs/ranchu_defconfig b/arch/arm/configs/ranchu_defconfig new file mode 100644 index 000000000000..35a90af941a4 --- /dev/null +++ b/arch/arm/configs/ranchu_defconfig @@ -0,0 +1,315 @@ +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +CONFIG_PROFILING=y +CONFIG_OPROFILE=y +CONFIG_ARCH_MMAP_RND_BITS=16 +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_IOSCHED_CFQ is not set +CONFIG_ARCH_VIRT=y +CONFIG_ARM_KERNMEM_PERMS=y +CONFIG_SMP=y +CONFIG_PREEMPT=y +CONFIG_AEABI=y +CONFIG_HIGHMEM=y +CONFIG_KSM=y +CONFIG_SECCOMP=y +CONFIG_CMDLINE="console=ttyAMA0" +CONFIG_VFP=y +CONFIG_NEON=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_INET_ESP=y +# CONFIG_INET_LRO is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y 
+CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_BRIDGE=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_MTD=y +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_CFI=y +CONFIG_MTD_CFI_INTELEXT=y +CONFIG_MTD_CFI_AMDSTD=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_SMSC911X=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO_SERPORT is not set +CONFIG_SERIO_AMBAKMI=y +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVMEM is not 
set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +# CONFIG_HW_RANDOM is not set +# CONFIG_HWMON is not set +CONFIG_MEDIA_SUPPORT=y +CONFIG_FB=y +CONFIG_FB_GOLDFISH=y +CONFIG_FB_SIMPLE=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_PL031=y +CONFIG_VIRTIO_MMIO=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_SW_SYNC_USER=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_FUSE_FS=y +CONFIG_CUSE=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +CONFIG_NFS_FS=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_DEBUG_INFO=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_PANIC_TIMEOUT=5 +# CONFIG_SCHED_DEBUG is not set +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_ENABLE_DEFAULT_TRACERS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_VIRTUALIZATION=y diff --git a/arch/arm64/configs/ranchu_defconfig b/arch/arm64/configs/ranchu_defconfig new file mode 100644 index 000000000000..00eb346e0928 --- /dev/null +++ b/arch/arm64/configs/ranchu_defconfig @@ -0,0 +1,311 @@ +# CONFIG_LOCALVERSION_AUTO is not set +# CONFIG_SWAP is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SCHED_AUTOGROUP=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is 
not set +CONFIG_PROFILING=y +CONFIG_ARCH_MMAP_RND_BITS=24 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_ARCH_VEXPRESS=y +CONFIG_NR_CPUS=4 +CONFIG_PREEMPT=y +CONFIG_KSM=y +CONFIG_SECCOMP=y +CONFIG_ARMV8_DEPRECATED=y +CONFIG_SWP_EMULATION=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y +CONFIG_CMDLINE="console=ttyAMA0" +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_COMPAT=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_INET_ESP=y +# CONFIG_INET_LRO is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_RPFILTER=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_TARGET_ECN=y +CONFIG_IP_NF_TARGET_TTL=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MATCH_AH=y +CONFIG_IP6_NF_MATCH_EUI64=y +CONFIG_IP6_NF_MATCH_FRAG=y +CONFIG_IP6_NF_MATCH_OPTS=y +CONFIG_IP6_NF_MATCH_HL=y +CONFIG_IP6_NF_MATCH_IPV6HEADER=y +CONFIG_IP6_NF_MATCH_MH=y +CONFIG_IP6_NF_MATCH_RT=y +CONFIG_IP6_NF_TARGET_HL=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y 
+CONFIG_IP6_NF_RAW=y +CONFIG_BRIDGE=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_SCSI=y +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_SMC91X=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_INPUT_TABLET=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +# CONFIG_HW_RANDOM is not set +CONFIG_BATTERY_GOLDFISH=y +# CONFIG_HWMON is not set +CONFIG_MEDIA_SUPPORT=y +CONFIG_FB=y +CONFIG_FB_GOLDFISH=y +CONFIG_FB_SIMPLE=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +# CONFIG_USB_SUPPORT is not set +CONFIG_RTC_CLASS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_TIMED_GPIO=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_SW_SYNC_USER=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +# CONFIG_IOMMU_SUPPORT is not set +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_EXT2_FS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_FUSE_FS=y +CONFIG_CUSE=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_MISC_FILESYSTEMS is not set +CONFIG_NFS_FS=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_FS=y +CONFIG_MAGIC_SYSRQ=y 
+CONFIG_PANIC_TIMEOUT=5 +# CONFIG_SCHED_DEBUG is not set +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +# CONFIG_FTRACE is not set +CONFIG_ATOMIC64_SELFTEST=y +CONFIG_DEBUG_RODATA=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig new file mode 100644 index 000000000000..b0e4e0ed4b11 --- /dev/null +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -0,0 +1,422 @@ +# CONFIG_64BIT is not set +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is not set +CONFIG_ARCH_MMAP_RND_BITS=16 +CONFIG_PARTITION_ADVANCED=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_SGI_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_SMP=y +CONFIG_X86_BIGSMP=y +CONFIG_MCORE2=y +CONFIG_X86_GENERIC=y +CONFIG_HPET_TIMER=y +CONFIG_NR_CPUS=512 +CONFIG_PREEMPT=y +# CONFIG_X86_MCE is not set +CONFIG_X86_REBOOTFIXUPS=y +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y +CONFIG_KSM=y +CONFIG_CMA=y +# CONFIG_MTRR_SANITIZER is not set +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_HZ_100=y +CONFIG_PHYSICAL_START=0x100000 +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_PCIEPORTBUS=y +# CONFIG_PCIEASPM is not set +CONFIG_PCCARD=y +CONFIG_YENTA=y +CONFIG_HOTPLUG_PCI=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_ESP=y +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +# CONFIG_INET_DIAG is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETLABEL=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y 
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +CONFIG_CFG80211=y +CONFIG_MAC80211=y +CONFIG_MAC80211_LEDS=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=16 +CONFIG_CONNECTOR=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SPI_ATTRS=y +CONFIG_SCSI_ISCSI_ATTRS=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_ATA_PIIX=y +CONFIG_PATA_AMD=y +CONFIG_PATA_OLDPIIX=y +CONFIG_PATA_SCH=y +CONFIG_PATA_MPIIX=y +CONFIG_ATA_GENERIC=y +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_DEBUG=y +CONFIG_DM_CRYPT=y +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_NETCONSOLE=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_BNX2=y +CONFIG_TIGON3=y +CONFIG_NET_TULIP=y +CONFIG_E100=y +CONFIG_E1000=y +CONFIG_E1000E=y +CONFIG_SKY2=y +CONFIG_NE2K_PCI=y +CONFIG_FORCEDETH=y +CONFIG_8139TOO=y +# CONFIG_8139TOO_PIO is not set +CONFIG_R8169=y +CONFIG_FDDI=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +CONFIG_INPUT_POLLDEV=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO is not set +# CONFIG_VT is not set 
+# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_NONSTANDARD=y +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_NVRAM=y +CONFIG_I2C_I801=y +CONFIG_BATTERY_GOLDFISH=y +CONFIG_WATCHDOG=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_AGP=y +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y +CONFIG_DRM=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y +CONFIG_FB_EFI=y +CONFIG_FB_GOLDFISH=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +# CONFIG_LCD_CLASS_DEVICE is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_MON=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_UHCI_HCD=y +CONFIG_USB_PRINTER=y +CONFIG_USB_STORAGE=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_EDAC=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +CONFIG_DMADEVICES=y +CONFIG_VIRTIO_PCI=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_SND_HDA_INTEL=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +CONFIG_FUSE_FS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_HUGETLBFS=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_INFO=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PROVIDE_OHCI1394_DMA_INIT=y +CONFIG_KEYS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y 
+CONFIG_CRYPTO_AES_586=y +CONFIG_CRYPTO_TWOFISH=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_PKCS7_TEST_KEY=y +# CONFIG_VIRTUALIZATION is not set +CONFIG_CRC_T10DIF=y diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig new file mode 100644 index 000000000000..8dae21ed3ede --- /dev/null +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -0,0 +1,417 @@ +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS_ALL=y +CONFIG_EMBEDDED=y +# CONFIG_COMPAT_BRK is not set +CONFIG_ARCH_MMAP_RND_BITS=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +CONFIG_PARTITION_ADVANCED=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_SGI_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_SMP=y +CONFIG_MCORE2=y +CONFIG_MAXSMP=y +CONFIG_PREEMPT=y +# CONFIG_X86_MCE is not set +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y +CONFIG_KSM=y +CONFIG_CMA=y +# CONFIG_MTRR_SANITIZER is not set +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_HZ_100=y +CONFIG_PHYSICAL_START=0x100000 +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +# CONFIG_PM_WAKELOCKS_GC is not set +CONFIG_PM_DEBUG=y +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_STAT is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCIEPORTBUS=y +# CONFIG_PCIEASPM is not set +CONFIG_PCCARD=y +CONFIG_YENTA=y +CONFIG_HOTPLUG_PCI=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_BINFMT_MISC=y +CONFIG_IA32_EMULATION=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=y +CONFIG_NET_KEY=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_ESP=y +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +# CONFIG_INET_DIAG is not set +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETLABEL=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y 
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_QTAGUID=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_QUOTA2=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_CLS_ACT=y +CONFIG_CFG80211=y +CONFIG_MAC80211=y +CONFIG_MAC80211_LEDS=y +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DMA_CMA=y +CONFIG_CONNECTOR=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SPI_ATTRS=y +CONFIG_SCSI_ISCSI_ATTRS=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_ATA_PIIX=y +CONFIG_PATA_AMD=y +CONFIG_PATA_OLDPIIX=y +CONFIG_PATA_SCH=y +CONFIG_PATA_MPIIX=y +CONFIG_ATA_GENERIC=y +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_BLK_DEV_DM=y +CONFIG_DM_DEBUG=y +CONFIG_DM_CRYPT=y +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_NETDEVICES=y +CONFIG_NETCONSOLE=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_BNX2=y +CONFIG_TIGON3=y +CONFIG_NET_TULIP=y +CONFIG_E100=y +CONFIG_E1000=y +CONFIG_E1000E=y +CONFIG_SKY2=y +CONFIG_NE2K_PCI=y +CONFIG_FORCEDETH=y +CONFIG_8139TOO=y +# CONFIG_8139TOO_PIO is not set +CONFIG_R8169=y +CONFIG_FDDI=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PPPOLAC=y +CONFIG_PPPOPNS=y +CONFIG_USB_USBNET=y +CONFIG_INPUT_POLLDEV=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_KEYRESET=y +# CONFIG_KEYBOARD_ATKBD is not set +CONFIG_KEYBOARD_GOLDFISH_EVENTS=y +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_KEYCHORD=y +CONFIG_INPUT_UINPUT=y +CONFIG_INPUT_GPIO=y +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is 
not set +CONFIG_SERIAL_NONSTANDARD=y +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_NVRAM=y +CONFIG_I2C_I801=y +CONFIG_BATTERY_GOLDFISH=y +CONFIG_WATCHDOG=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_AGP=y +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y +CONFIG_DRM=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y +CONFIG_FB_EFI=y +CONFIG_FB_GOLDFISH=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +# CONFIG_LCD_CLASS_DEVICE is not set +CONFIG_SOUND=y +CONFIG_SND=y +CONFIG_HIDRAW=y +CONFIG_UHID=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EZKEY=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WALTOP=y +CONFIG_HID_GYRATION=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_GREENASIA=y +CONFIG_GREENASIA_FF=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_WACOM=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_MON=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_UHCI_HCD=y +CONFIG_USB_PRINTER=y +CONFIG_USB_STORAGE=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_EDAC=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +CONFIG_DMADEVICES=y +CONFIG_VIRTIO_PCI=y +CONFIG_STAGING=y +CONFIG_ASHMEM=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y +CONFIG_ION=y +CONFIG_GOLDFISH_AUDIO=y +CONFIG_SND_HDA_INTEL=y +CONFIG_GOLDFISH=y +CONFIG_GOLDFISH_PIPE=y +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +CONFIG_FUSE_FS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_HUGETLBFS=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_INFO=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PROVIDE_OHCI1394_DMA_INIT=y +CONFIG_KEYS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_CRYPTO_TWOFISH=y +CONFIG_ASYMMETRIC_KEY_TYPE=y 
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y

From 2c8ebe3687301b2a94bb551d6d1a6570e2b407d2 Mon Sep 17 00:00:00 2001
From: Lingfeng Yang
Date: Mon, 13 Jun 2016 09:24:07 -0700
Subject: [PATCH 0553/3220] ANDROID: goldfish: Add goldfish sync driver

This is the kernel driver for controlling the Goldfish sync device on
the host. It is used to maintain ordering in critical OpenGL state
changes while using GPU emulation.

The guest open()'s the Goldfish sync device to create a context that
may maintain a sync timeline and fences. There is a 1:1 correspondence
between such sync contexts and OpenGL contexts in the guest that need
synchronization (which, in turn, is anything involving buffer swaps,
SurfaceFlinger, or Hardware Composer).

The ioctl QUEUE_WORK takes a handle to a sync object and tells the
host GPU to wait on the sync object and handle signaling it. It may
output a fence FD that the Android components using fences
(GLConsumer, SurfaceFlinger, anything employing
EGL_ANDROID_native_fence_sync) can wait on.

Design decisions and work log:
- The new approach is to have the guest issue ioctls that trigger a
  host-side wait, after which the host increments the timeline.
- The host's sync object handle and sync thread handle are the
  information needed for that.
- ioctl() calls from the guest can run concurrently with the interrupt
  handling for commands from the host.
- optimization: don't write back on timeline inc
- Change the spinlock design to be much more lightweight; do not call
  sw_sync functions or loop too long anywhere.
- Send read/write commands in batches to minimize guest/host
  transitions.
- robustness: BUG if we would overrun the cmd buffer.
- robustness: return fd -1 if we cannot get an unused fd.
- correctness: remove global mutex
- cleanup pass done, including but not limited to: removal of
  clear_upto, and switching to devm_***

This is part of a sequential, multi-CL change:

external/qemu:
https://android-review.googlesource.com/239442 <- host-side device's host interface
https://android-review.googlesource.com/221593
https://android-review.googlesource.com/248563
https://android-review.googlesource.com/248564
https://android-review.googlesource.com/223032

external/qemu-android:
https://android-review.googlesource.com/238790 <- host-side device implementation

kernel/goldfish:
https://android-review.googlesource.com/232631 <- needed
https://android-review.googlesource.com/238399 <- this CL

Also squash the following bug fixes from the android-goldfish-3.18
branch.
b44d486 goldfish_sync: provide a signal to detect reboot ad1f597 goldfish_sync: fix stalls by avoiding early kfree() de208e8 [goldfish-sync] Fix possible race between kernel and user space Change-Id: I22f8a0e824717a7e751b1b0e1b461455501502b6 --- arch/x86/configs/i386_ranchu_defconfig | 1 + arch/x86/configs/x86_64_ranchu_defconfig | 1 + drivers/staging/goldfish/Kconfig | 6 + drivers/staging/goldfish/Makefile | 5 + drivers/staging/goldfish/goldfish_sync.c | 987 +++++++++++++++++++++++ 5 files changed, 1000 insertions(+) create mode 100644 drivers/staging/goldfish/goldfish_sync.c diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig index b0e4e0ed4b11..0206eb8cfb61 100644 --- a/arch/x86/configs/i386_ranchu_defconfig +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -363,6 +363,7 @@ CONFIG_SYNC=y CONFIG_SW_SYNC=y CONFIG_ION=y CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH_SYNC=y CONFIG_SND_HDA_INTEL=y CONFIG_GOLDFISH=y CONFIG_GOLDFISH_PIPE=y diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig index 8dae21ed3ede..dd389774bacb 100644 --- a/arch/x86/configs/x86_64_ranchu_defconfig +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -360,6 +360,7 @@ CONFIG_SYNC=y CONFIG_SW_SYNC=y CONFIG_ION=y CONFIG_GOLDFISH_AUDIO=y +CONFIG_GOLDFISH_SYNC=y CONFIG_SND_HDA_INTEL=y CONFIG_GOLDFISH=y CONFIG_GOLDFISH_PIPE=y diff --git a/drivers/staging/goldfish/Kconfig b/drivers/staging/goldfish/Kconfig index 4e094602437c..c579141a7bed 100644 --- a/drivers/staging/goldfish/Kconfig +++ b/drivers/staging/goldfish/Kconfig @@ -4,6 +4,12 @@ config GOLDFISH_AUDIO ---help--- Emulated audio channel for the Goldfish Android Virtual Device +config GOLDFISH_SYNC + tristate "Goldfish AVD Sync Driver" + depends on GOLDFISH + ---help--- + Emulated sync fences for the Goldfish Android Virtual Device + config MTD_GOLDFISH_NAND tristate "Goldfish NAND device" depends on GOLDFISH diff --git a/drivers/staging/goldfish/Makefile b/drivers/staging/goldfish/Makefile index dec34ad58162..0cf525588210 100644 --- a/drivers/staging/goldfish/Makefile +++ b/drivers/staging/goldfish/Makefile @@ -4,3 +4,8 @@ obj-$(CONFIG_GOLDFISH_AUDIO) += goldfish_audio.o obj-$(CONFIG_MTD_GOLDFISH_NAND) += goldfish_nand.o + +# and sync + +ccflags-y := -Idrivers/staging/android +obj-$(CONFIG_GOLDFISH_SYNC) += goldfish_sync.o diff --git a/drivers/staging/goldfish/goldfish_sync.c b/drivers/staging/goldfish/goldfish_sync.c new file mode 100644 index 000000000000..ba8def29901e --- /dev/null +++ b/drivers/staging/goldfish/goldfish_sync.c @@ -0,0 +1,987 @@ +/* + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "sw_sync.h" +#include "sync.h" + +#define ERR(...) printk(KERN_ERR __VA_ARGS__); + +#define INFO(...) printk(KERN_INFO __VA_ARGS__); + +#define DPRINT(...) 
pr_debug(__VA_ARGS__);
+
+#define DTRACE() DPRINT("%s: enter", __func__)
+
+/* The Goldfish sync driver is designed to provide an interface
+ * between the underlying host's sync device and the kernel's
+ * sw_sync.
+ * The purpose of the device/driver is to enable lightweight
+ * creation and signaling of timelines and fences
+ * in order to synchronize the guest with host-side graphics events.
+ *
+ * Each time the interrupt trips, the driver
+ * may perform a sw_sync operation.
+ */
+
+/* The operations are: */
+
+/* Ready signal - used to mark when irq should lower */
+#define CMD_SYNC_READY            0
+
+/* Create a new timeline. writes timeline handle */
+#define CMD_CREATE_SYNC_TIMELINE  1
+
+/* Create a fence object. reads timeline handle and time argument.
+ * Writes fence fd to the SYNC_REG_HANDLE register. */
+#define CMD_CREATE_SYNC_FENCE     2
+
+/* Increments timeline. reads timeline handle and time argument */
+#define CMD_SYNC_TIMELINE_INC     3
+
+/* Destroys a timeline. reads timeline handle */
+#define CMD_DESTROY_SYNC_TIMELINE 4
+
+/* Starts a wait on the host with
+ * the given glsync object and sync thread handle. */
+#define CMD_TRIGGER_HOST_WAIT     5
+
+/* The register layout is: */
+
+#define SYNC_REG_BATCH_COMMAND                0x00 /* host->guest batch commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND           0x04 /* guest->host batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR           0x08 /* communicate physical address of host->guest batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR_HIGH      0x0c /* 64-bit part */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR      0x10 /* communicate physical address of guest->host commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH 0x14 /* 64-bit part */
+#define SYNC_REG_INIT                         0x18 /* signals that the device has been probed */
+
+/* There is an ioctl associated with the goldfish sync driver.
+ * Make it conflict with ioctls that are not likely to be used
+ * in the emulator.
+ *
+ * '@'	00-0F	linux/radeonfb.h	conflict!
+ * '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
+ */
+#define GOLDFISH_SYNC_IOC_MAGIC	'@'
+
+#define GOLDFISH_SYNC_IOC_QUEUE_WORK	_IOWR(GOLDFISH_SYNC_IOC_MAGIC, 0, struct goldfish_sync_ioctl_info)
+
+/* The above definitions (command codes, register layout, ioctl definitions)
+ * need to be in sync with the following files:
+ *
+ * Host-side (emulator):
+ * external/qemu/android/emulation/goldfish_sync.h
+ * external/qemu-android/hw/misc/goldfish_sync.c
+ *
+ * Guest-side (system image):
+ * device/generic/goldfish-opengl/system/egl/goldfish_sync.h
+ * device/generic/goldfish/ueventd.ranchu.rc
+ * platform/build/target/board/generic/sepolicy/file_contexts
+ */
+struct goldfish_sync_hostcmd {
+	/* sorted for alignment */
+	uint64_t handle;
+	uint64_t hostcmd_handle;
+	uint32_t cmd;
+	uint32_t time_arg;
+};
+
+struct goldfish_sync_guestcmd {
+	uint64_t host_command; /* uint64_t for alignment */
+	uint64_t glsync_handle;
+	uint64_t thread_handle;
+	uint64_t guest_timeline_handle;
+};
+
+#define GOLDFISH_SYNC_MAX_CMDS 64
+
+struct goldfish_sync_state {
+	char __iomem *reg_base;
+	int irq;
+
+	/* Spinlock protects |to_do| / |to_do_end|. */
+	spinlock_t lock;
+	/* |mutex_lock| protects all concurrent access
+	 * to timelines for both kernel and user space. */
+	struct mutex mutex_lock;
+
+	/* Buffer holding commands issued from host. */
+	struct goldfish_sync_hostcmd to_do[GOLDFISH_SYNC_MAX_CMDS];
+	uint32_t to_do_end;
+
+	/* Addresses for the reading or writing
+	 * of individual commands.
+	 * The host can directly write
+	 * to |batch_hostcmd| (and then this driver immediately
+	 * copies contents to |to_do|). This driver either replies
+	 * through |batch_hostcmd| or simply issues a
+	 * guest->host command through |batch_guestcmd|.
+	 */
+	struct goldfish_sync_hostcmd *batch_hostcmd;
+	struct goldfish_sync_guestcmd *batch_guestcmd;
+
+	/* Used to give this struct itself to a work queue
+	 * function for executing actual sync commands. */
+	struct work_struct work_item;
+};
+
+static struct goldfish_sync_state global_sync_state[1];
+
+struct goldfish_sync_timeline_obj {
+	struct sw_sync_timeline *sw_sync_tl;
+	uint32_t current_time;
+	/* We need to be careful about when we deallocate
+	 * this |goldfish_sync_timeline_obj| struct.
+	 * In order to ensure proper cleanup, we need to
+	 * consider the triggered host-side wait that may
+	 * still be in flight when the guest close()'s a
+	 * goldfish_sync device's sync context fd (and
+	 * destroys the |sw_sync_tl| field above).
+	 * The host-side wait may raise IRQ
+	 * and tell the kernel to increment the timeline _after_
+	 * the |sw_sync_tl| has already been set to null.
+	 *
+	 * From observations on OpenGL apps and CTS tests, this
+	 * happens at some very low probability upon context
+	 * destruction or process close, but it does happen
+	 * and it needs to be handled properly. Otherwise,
+	 * if we clean up the surrounding |goldfish_sync_timeline_obj|
+	 * too early, any |handle| field of any host->guest command
+	 * might not even point to a null |sw_sync_tl| field,
+	 * but to garbage memory or even a reclaimed |sw_sync_tl|.
+	 * If we do not count such "pending waits" and kfree the object
+	 * immediately upon |goldfish_sync_timeline_destroy|,
+	 * we might get mysterious RCU stalls after running a long
+	 * time because the garbage memory that is being read
+	 * happens to be interpretable as a |spinlock_t| struct
+	 * that is currently in the locked state.
+	 *
+	 * To track when to free the |goldfish_sync_timeline_obj|
+	 * itself, we maintain a kref.
+	 * The kref essentially counts the timeline itself plus
+	 * the number of waits in flight. kref_init/kref_put
+	 * are issued on
+	 * |goldfish_sync_timeline_create|/|goldfish_sync_timeline_destroy|
+	 * and kref_get/kref_put are issued on
+	 * |goldfish_sync_fence_create|/|goldfish_sync_timeline_inc|.
+	 *
+	 * The timeline is destroyed after the reference count
+	 * reaches zero, which would happen after
+	 * |goldfish_sync_timeline_destroy| and all pending
+	 * |goldfish_sync_timeline_inc|'s are fulfilled.
+	 *
+	 * NOTE (1): We assume that |fence_create| and
+	 * |timeline_inc| calls are 1:1, otherwise the kref scheme
+	 * will not work. This is a valid assumption as long
+	 * as the host-side virtual device implementation
+	 * does not insert any timeline increments
+	 * that we did not trigger from here.
+	 *
+	 * NOTE (2): The use of kref by itself requires no locks,
+	 * but this does not mean everything works without locks.
+	 * Related timeline operations do require a lock of some sort,
+	 * or at least are not proven to work without it.
+	 * In particular, we assume that all the operations
+	 * done on the |kref| field above are done in contexts where
+	 * |global_sync_state->mutex_lock| is held. Do not
+	 * remove that lock until everything is proven to work
+	 * without it!!! */
+	struct kref kref;
+};
+
+/* We will call |delete_timeline_obj| when the last reference count
+ * of the kref is decremented. This deletes the sw_sync
+ * timeline object along with the wrapper itself.
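+ *
+ * For reference, a condensed sketch of the kref call pattern described
+ * above (taken from the functions below; not additional code in this
+ * patch):
+ *
+ *   kref_init(&obj->kref);       // goldfish_sync_timeline_create()
+ *   kref_get(&obj->kref);        // goldfish_sync_fence_create(): wait now in flight
+ *   kref_put(&obj->kref, delete_timeline_obj);  // goldfish_sync_timeline_inc()
+ *   kref_put(&obj->kref, delete_timeline_obj);  // goldfish_sync_timeline_destroy()
+ *
+ * Only the final kref_put actually frees the object, so the relative
+ * order of destroy vs. pending increments does not matter.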
*/ +static void delete_timeline_obj(struct kref* kref) { + struct goldfish_sync_timeline_obj* obj = + container_of(kref, struct goldfish_sync_timeline_obj, kref); + + sync_timeline_destroy(&obj->sw_sync_tl->obj); + obj->sw_sync_tl = NULL; + kfree(obj); +} + +static uint64_t gensym_ctr; +static void gensym(char *dst) +{ + sprintf(dst, "goldfish_sync:gensym:%llu", gensym_ctr); + gensym_ctr++; +} + +/* |goldfish_sync_timeline_create| assumes that |global_sync_state->mutex_lock| + * is held. */ +static struct goldfish_sync_timeline_obj* +goldfish_sync_timeline_create(void) +{ + + char timeline_name[256]; + struct sw_sync_timeline *res_sync_tl = NULL; + struct goldfish_sync_timeline_obj *res; + + DTRACE(); + + gensym(timeline_name); + + res_sync_tl = sw_sync_timeline_create(timeline_name); + if (!res_sync_tl) { + ERR("Failed to create sw_sync timeline."); + return NULL; + } + + res = kzalloc(sizeof(struct goldfish_sync_timeline_obj), GFP_KERNEL); + res->sw_sync_tl = res_sync_tl; + res->current_time = 0; + kref_init(&res->kref); + + DPRINT("new timeline_obj=0x%p", res); + return res; +} + +/* |goldfish_sync_fence_create| assumes that |global_sync_state->mutex_lock| + * is held. */ +static int +goldfish_sync_fence_create(struct goldfish_sync_timeline_obj *obj, + uint32_t val) +{ + + int fd; + char fence_name[256]; + struct sync_pt *syncpt = NULL; + struct sync_fence *sync_obj = NULL; + struct sw_sync_timeline *tl; + + DTRACE(); + + if (!obj) return -1; + + tl = obj->sw_sync_tl; + + syncpt = sw_sync_pt_create(tl, val); + if (!syncpt) { + ERR("could not create sync point! " + "sync_timeline=0x%p val=%d", + tl, val); + return -1; + } + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + ERR("could not get unused fd for sync fence. " + "errno=%d", fd); + goto err_cleanup_pt; + } + + gensym(fence_name); + + sync_obj = sync_fence_create(fence_name, syncpt); + if (!sync_obj) { + ERR("could not create sync fence! " + "sync_timeline=0x%p val=%d sync_pt=0x%p", + tl, val, syncpt); + goto err_cleanup_fd_pt; + } + + DPRINT("installing sync fence into fd %d sync_obj=0x%p", fd, sync_obj); + sync_fence_install(sync_obj, fd); + kref_get(&obj->kref); + + return fd; + +err_cleanup_fd_pt: + put_unused_fd(fd); +err_cleanup_pt: + sync_pt_free(syncpt); + return -1; +} + +/* |goldfish_sync_timeline_inc| assumes that |global_sync_state->mutex_lock| + * is held. */ +static void +goldfish_sync_timeline_inc(struct goldfish_sync_timeline_obj *obj, uint32_t inc) +{ + DTRACE(); + /* Just give up if someone else nuked the timeline. + * Whoever it was won't care that it doesn't get signaled. */ + if (!obj) return; + + DPRINT("timeline_obj=0x%p", obj); + sw_sync_timeline_inc(obj->sw_sync_tl, inc); + DPRINT("incremented timeline. increment max_time"); + obj->current_time += inc; + + /* Here, we will end up deleting the timeline object if it + * turns out that this call was a pending increment after + * |goldfish_sync_timeline_destroy| was called. */ + kref_put(&obj->kref, delete_timeline_obj); + DPRINT("done"); +} + +/* |goldfish_sync_timeline_destroy| assumes + * that |global_sync_state->mutex_lock| is held. 
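+ *
+ * A typical caller therefore looks like this (the pattern used by
+ * goldfish_sync_release() further below):
+ *
+ *   mutex_lock(&global_sync_state->mutex_lock);
+ *   goldfish_sync_timeline_destroy(sync_context->timeline);
+ *   mutex_unlock(&global_sync_state->mutex_lock);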
*/ +static void +goldfish_sync_timeline_destroy(struct goldfish_sync_timeline_obj *obj) +{ + DTRACE(); + /* See description of |goldfish_sync_timeline_obj| for why we + * should not immediately destroy |obj| */ + kref_put(&obj->kref, delete_timeline_obj); +} + +static inline void +goldfish_sync_cmd_queue(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t handle, + uint32_t time_arg, + uint64_t hostcmd_handle) +{ + struct goldfish_sync_hostcmd *to_add; + + DTRACE(); + + BUG_ON(sync_state->to_do_end == GOLDFISH_SYNC_MAX_CMDS); + + to_add = &sync_state->to_do[sync_state->to_do_end]; + + to_add->cmd = cmd; + to_add->handle = handle; + to_add->time_arg = time_arg; + to_add->hostcmd_handle = hostcmd_handle; + + sync_state->to_do_end += 1; +} + +static inline void +goldfish_sync_hostcmd_reply(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t handle, + uint32_t time_arg, + uint64_t hostcmd_handle) +{ + unsigned long irq_flags; + struct goldfish_sync_hostcmd *batch_hostcmd = + sync_state->batch_hostcmd; + + DTRACE(); + + spin_lock_irqsave(&sync_state->lock, irq_flags); + + batch_hostcmd->cmd = cmd; + batch_hostcmd->handle = handle; + batch_hostcmd->time_arg = time_arg; + batch_hostcmd->hostcmd_handle = hostcmd_handle; + writel(0, sync_state->reg_base + SYNC_REG_BATCH_COMMAND); + + spin_unlock_irqrestore(&sync_state->lock, irq_flags); +} + +static inline void +goldfish_sync_send_guestcmd(struct goldfish_sync_state *sync_state, + uint32_t cmd, + uint64_t glsync_handle, + uint64_t thread_handle, + uint64_t timeline_handle) +{ + unsigned long irq_flags; + struct goldfish_sync_guestcmd *batch_guestcmd = + sync_state->batch_guestcmd; + + DTRACE(); + + spin_lock_irqsave(&sync_state->lock, irq_flags); + + batch_guestcmd->host_command = (uint64_t)cmd; + batch_guestcmd->glsync_handle = (uint64_t)glsync_handle; + batch_guestcmd->thread_handle = (uint64_t)thread_handle; + batch_guestcmd->guest_timeline_handle = (uint64_t)timeline_handle; + writel(0, sync_state->reg_base + SYNC_REG_BATCH_GUESTCOMMAND); + + spin_unlock_irqrestore(&sync_state->lock, irq_flags); +} + +/* |goldfish_sync_interrupt| handles IRQ raises from the virtual device. + * In the context of OpenGL, this interrupt will fire whenever we need + * to signal a fence fd in the guest, with the command + * |CMD_SYNC_TIMELINE_INC|. + * However, because this function will be called in an interrupt context, + * it is necessary to do the actual work of signaling off of interrupt context. + * The shared work queue is used for this purpose. At the end when + * all pending commands are intercepted by the interrupt handler, + * we call |schedule_work|, which will later run the actual + * desired sync command in |goldfish_sync_work_item_fn|. 
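+ *
+ * In outline, the top half below reduces to the following (a
+ * simplified sketch with the register reads elided):
+ *
+ *   spin_lock(&sync_state->lock);
+ *   while (batch_hostcmd->cmd != 0)   // refreshed via SYNC_REG_BATCH_COMMAND
+ *           goldfish_sync_cmd_queue(sync_state, ...);
+ *   spin_unlock(&sync_state->lock);
+ *   schedule_work(&sync_state->work_item);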
+ */ +static irqreturn_t goldfish_sync_interrupt(int irq, void *dev_id) +{ + + struct goldfish_sync_state *sync_state = dev_id; + + uint32_t nextcmd; + uint32_t command_r; + uint64_t handle_rw; + uint32_t time_r; + uint64_t hostcmd_handle_rw; + + int count = 0; + + DTRACE(); + + sync_state = dev_id; + + spin_lock(&sync_state->lock); + + for (;;) { + + readl(sync_state->reg_base + SYNC_REG_BATCH_COMMAND); + nextcmd = sync_state->batch_hostcmd->cmd; + + if (nextcmd == 0) + break; + + command_r = nextcmd; + handle_rw = sync_state->batch_hostcmd->handle; + time_r = sync_state->batch_hostcmd->time_arg; + hostcmd_handle_rw = sync_state->batch_hostcmd->hostcmd_handle; + + goldfish_sync_cmd_queue( + sync_state, + command_r, + handle_rw, + time_r, + hostcmd_handle_rw); + + count++; + } + + spin_unlock(&sync_state->lock); + + schedule_work(&sync_state->work_item); + + return (count == 0) ? IRQ_NONE : IRQ_HANDLED; +} + +/* |goldfish_sync_work_item_fn| does the actual work of servicing + * host->guest sync commands. This function is triggered whenever + * the IRQ for the goldfish sync device is raised. Once it starts + * running, it grabs the contents of the buffer containing the + * commands it needs to execute (there may be multiple, because + * our IRQ is active high and not edge triggered), and then + * runs all of them one after the other. + */ +static void goldfish_sync_work_item_fn(struct work_struct *input) +{ + + struct goldfish_sync_state *sync_state; + int sync_fence_fd; + + struct goldfish_sync_timeline_obj *timeline; + uint64_t timeline_ptr; + + uint64_t hostcmd_handle; + + uint32_t cmd; + uint64_t handle; + uint32_t time_arg; + + struct goldfish_sync_hostcmd *todo; + uint32_t todo_end; + + unsigned long irq_flags; + + struct goldfish_sync_hostcmd to_run[GOLDFISH_SYNC_MAX_CMDS]; + uint32_t i = 0; + + sync_state = container_of(input, struct goldfish_sync_state, work_item); + + mutex_lock(&sync_state->mutex_lock); + + spin_lock_irqsave(&sync_state->lock, irq_flags); { + + todo_end = sync_state->to_do_end; + + DPRINT("num sync todos: %u", sync_state->to_do_end); + + for (i = 0; i < todo_end; i++) + to_run[i] = sync_state->to_do[i]; + + /* We expect that commands will come in at a slow enough rate + * so that incoming items will not be more than + * GOLDFISH_SYNC_MAX_CMDS. + * + * This is because the way the sync device is used, + * it's only for managing buffer data transfers per frame, + * with a sequential dependency between putting things in + * to_do and taking them out. Once a set of commands is + * queued up in to_do, the user of the device waits for + * them to be processed before queuing additional commands, + * which limits the rate at which commands come in + * to the rate at which we take them out here. + * + * We also don't expect more than MAX_CMDS to be issued + * at once; there is a correspondence between + * which buffers need swapping to the (display / buffer queue) + * to particular commands, and we don't expect there to be + * enough display or buffer queues in operation at once + * to overrun GOLDFISH_SYNC_MAX_CMDS. 
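+ *
+ * As a rough illustrative bound (an assumption about the workload, not
+ * a measurement): even with several buffer queues each posting a few
+ * commands per frame, a batch stays in the tens of commands at most,
+ * well under GOLDFISH_SYNC_MAX_CMDS (64).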
+ */ + sync_state->to_do_end = 0; + + } spin_unlock_irqrestore(&sync_state->lock, irq_flags); + + for (i = 0; i < todo_end; i++) { + DPRINT("todo index: %u", i); + + todo = &to_run[i]; + + cmd = todo->cmd; + + handle = (uint64_t)todo->handle; + time_arg = todo->time_arg; + hostcmd_handle = (uint64_t)todo->hostcmd_handle; + + DTRACE(); + + timeline = (struct goldfish_sync_timeline_obj *)(uintptr_t)handle; + + switch (cmd) { + case CMD_SYNC_READY: + break; + case CMD_CREATE_SYNC_TIMELINE: + DPRINT("exec CMD_CREATE_SYNC_TIMELINE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + timeline = goldfish_sync_timeline_create(); + timeline_ptr = (uintptr_t)timeline; + goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_TIMELINE, + timeline_ptr, + 0, + hostcmd_handle); + DPRINT("sync timeline created: %p", timeline); + break; + case CMD_CREATE_SYNC_FENCE: + DPRINT("exec CMD_CREATE_SYNC_FENCE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + sync_fence_fd = goldfish_sync_fence_create(timeline, time_arg); + goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_FENCE, + sync_fence_fd, + 0, + hostcmd_handle); + break; + case CMD_SYNC_TIMELINE_INC: + DPRINT("exec CMD_SYNC_TIMELINE_INC: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + goldfish_sync_timeline_inc(timeline, time_arg); + break; + case CMD_DESTROY_SYNC_TIMELINE: + DPRINT("exec CMD_DESTROY_SYNC_TIMELINE: " + "handle=0x%llx time_arg=%d", + handle, time_arg); + goldfish_sync_timeline_destroy(timeline); + break; + } + DPRINT("Done executing sync command"); + } + mutex_unlock(&sync_state->mutex_lock); +} + +/* Guest-side interface: file operations */ + +/* Goldfish sync context and ioctl info. + * + * When a sync context is created by open()-ing the goldfish sync device, we + * create a sync context (|goldfish_sync_context|). + * + * Currently, the only data required to track is the sync timeline itself + * along with the current time, which are all packed up in the + * |goldfish_sync_timeline_obj| field. We use a |goldfish_sync_context| + * as the filp->private_data. + * + * Next, when a sync context user requests that work be queued and a fence + * fd provided, we use the |goldfish_sync_ioctl_info| struct, which holds + * information about which host handles to touch for this particular + * queue-work operation. We need to know about the host-side sync thread + * and the particular host-side GLsync object. We also possibly write out + * a file descriptor. 
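+ *
+ * A hypothetical user-space caller looks like this (the handle values
+ * are illustrative; they come from the host-side GL implementation):
+ *
+ *   struct goldfish_sync_ioctl_info info = {
+ *           .host_glsync_handle_in = glsync_handle,
+ *           .host_syncthread_handle_in = syncthread_handle,
+ *   };
+ *   if (ioctl(goldfish_sync_fd, GOLDFISH_SYNC_IOC_QUEUE_WORK, &info) == 0)
+ *           ... wait on info.fence_fd_out ...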
+ */ +struct goldfish_sync_context { + struct goldfish_sync_timeline_obj *timeline; +}; + +struct goldfish_sync_ioctl_info { + uint64_t host_glsync_handle_in; + uint64_t host_syncthread_handle_in; + int fence_fd_out; +}; + +static int goldfish_sync_open(struct inode *inode, struct file *file) +{ + + struct goldfish_sync_context *sync_context; + + DTRACE(); + + mutex_lock(&global_sync_state->mutex_lock); + + sync_context = kzalloc(sizeof(struct goldfish_sync_context), GFP_KERNEL); + + if (sync_context == NULL) { + ERR("Creation of goldfish sync context failed!"); + mutex_unlock(&global_sync_state->mutex_lock); + return -ENOMEM; + } + + sync_context->timeline = NULL; + + file->private_data = sync_context; + + DPRINT("successfully create a sync context @0x%p", sync_context); + + mutex_unlock(&global_sync_state->mutex_lock); + + return 0; +} + +static int goldfish_sync_release(struct inode *inode, struct file *file) +{ + + struct goldfish_sync_context *sync_context; + + DTRACE(); + + mutex_lock(&global_sync_state->mutex_lock); + + sync_context = file->private_data; + + if (sync_context->timeline) + goldfish_sync_timeline_destroy(sync_context->timeline); + + sync_context->timeline = NULL; + + kfree(sync_context); + + mutex_unlock(&global_sync_state->mutex_lock); + + return 0; +} + +/* |goldfish_sync_ioctl| is the guest-facing interface of goldfish sync + * and is used in conjunction with eglCreateSyncKHR to queue up the + * actual work of waiting for the EGL sync command to complete, + * possibly returning a fence fd to the guest. + */ +static long goldfish_sync_ioctl(struct file *file, + unsigned int cmd, + unsigned long arg) +{ + struct goldfish_sync_context *sync_context_data; + struct goldfish_sync_timeline_obj *timeline; + int fd_out; + struct goldfish_sync_ioctl_info ioctl_data; + + DTRACE(); + + sync_context_data = file->private_data; + fd_out = -1; + + switch (cmd) { + case GOLDFISH_SYNC_IOC_QUEUE_WORK: + + DPRINT("exec GOLDFISH_SYNC_IOC_QUEUE_WORK"); + + mutex_lock(&global_sync_state->mutex_lock); + + if (copy_from_user(&ioctl_data, + (void __user *)arg, + sizeof(ioctl_data))) { + ERR("Failed to copy memory for ioctl_data from user."); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + if (ioctl_data.host_syncthread_handle_in == 0) { + DPRINT("Error: zero host syncthread handle!!!"); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + if (!sync_context_data->timeline) { + DPRINT("no timeline yet, create one."); + sync_context_data->timeline = goldfish_sync_timeline_create(); + DPRINT("timeline: 0x%p", &sync_context_data->timeline); + } + + timeline = sync_context_data->timeline; + fd_out = goldfish_sync_fence_create(timeline, + timeline->current_time + 1); + DPRINT("Created fence with fd %d and current time %u (timeline: 0x%p)", + fd_out, + sync_context_data->timeline->current_time + 1, + sync_context_data->timeline); + + ioctl_data.fence_fd_out = fd_out; + + if (copy_to_user((void __user *)arg, + &ioctl_data, + sizeof(ioctl_data))) { + DPRINT("Error, could not copy to user!!!"); + + sys_close(fd_out); + /* We won't be doing an increment, kref_put immediately. */ + kref_put(&timeline->kref, delete_timeline_obj); + mutex_unlock(&global_sync_state->mutex_lock); + return -EFAULT; + } + + /* We are now about to trigger a host-side wait; + * accumulate on |pending_waits|. 
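+	 * The timeline pointer is passed to the host as an opaque
+	 * 64-bit handle; the host echoes it back in the eventual
+	 * CMD_SYNC_TIMELINE_INC, which is how the increment finds
+	 * its timeline again in goldfish_sync_work_item_fn().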
*/ + goldfish_sync_send_guestcmd(global_sync_state, + CMD_TRIGGER_HOST_WAIT, + ioctl_data.host_glsync_handle_in, + ioctl_data.host_syncthread_handle_in, + (uint64_t)(uintptr_t)(sync_context_data->timeline)); + + mutex_unlock(&global_sync_state->mutex_lock); + return 0; + default: + return -ENOTTY; + } +} + +static const struct file_operations goldfish_sync_fops = { + .owner = THIS_MODULE, + .open = goldfish_sync_open, + .release = goldfish_sync_release, + .unlocked_ioctl = goldfish_sync_ioctl, + .compat_ioctl = goldfish_sync_ioctl, +}; + +static struct miscdevice goldfish_sync_device = { + .name = "goldfish_sync", + .fops = &goldfish_sync_fops, +}; + + +static bool setup_verify_batch_cmd_addr(struct goldfish_sync_state *sync_state, + void *batch_addr, + uint32_t addr_offset, + uint32_t addr_offset_high) +{ + uint64_t batch_addr_phys; + uint32_t batch_addr_phys_test_lo; + uint32_t batch_addr_phys_test_hi; + + if (!batch_addr) { + ERR("Could not use batch command address!"); + return false; + } + + batch_addr_phys = virt_to_phys(batch_addr); + writel((uint32_t)(batch_addr_phys), + sync_state->reg_base + addr_offset); + writel((uint32_t)(batch_addr_phys >> 32), + sync_state->reg_base + addr_offset_high); + + batch_addr_phys_test_lo = + readl(sync_state->reg_base + addr_offset); + batch_addr_phys_test_hi = + readl(sync_state->reg_base + addr_offset_high); + + if (virt_to_phys(batch_addr) != + (((uint64_t)batch_addr_phys_test_hi << 32) | + batch_addr_phys_test_lo)) { + ERR("Invalid batch command address!"); + return false; + } + + return true; +} + +int goldfish_sync_probe(struct platform_device *pdev) +{ + struct resource *ioresource; + struct goldfish_sync_state *sync_state = global_sync_state; + int status; + + DTRACE(); + + sync_state->to_do_end = 0; + + spin_lock_init(&sync_state->lock); + mutex_init(&sync_state->mutex_lock); + + platform_set_drvdata(pdev, sync_state); + + ioresource = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (ioresource == NULL) { + ERR("platform_get_resource failed"); + return -ENODEV; + } + + sync_state->reg_base = devm_ioremap(&pdev->dev, ioresource->start, PAGE_SIZE); + if (sync_state->reg_base == NULL) { + ERR("Could not ioremap"); + return -ENOMEM; + } + + sync_state->irq = platform_get_irq(pdev, 0); + if (sync_state->irq < 0) { + ERR("Could not platform_get_irq"); + return -ENODEV; + } + + status = devm_request_irq(&pdev->dev, + sync_state->irq, + goldfish_sync_interrupt, + IRQF_SHARED, + pdev->name, + sync_state); + if (status) { + ERR("request_irq failed"); + return -ENODEV; + } + + INIT_WORK(&sync_state->work_item, + goldfish_sync_work_item_fn); + + misc_register(&goldfish_sync_device); + + /* Obtain addresses for batch send/recv of commands. 
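+	 * The 64-bit physical address of each buffer is split across
+	 * the _ADDR/_ADDR_HIGH register pair, and
+	 * setup_verify_batch_cmd_addr() above reads both halves back
+	 * to confirm the device latched the same address we wrote.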
*/ + { + struct goldfish_sync_hostcmd *batch_addr_hostcmd; + struct goldfish_sync_guestcmd *batch_addr_guestcmd; + + batch_addr_hostcmd = devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_hostcmd), + GFP_KERNEL); + batch_addr_guestcmd = devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_guestcmd), + GFP_KERNEL); + + if (!setup_verify_batch_cmd_addr(sync_state, + batch_addr_hostcmd, + SYNC_REG_BATCH_COMMAND_ADDR, + SYNC_REG_BATCH_COMMAND_ADDR_HIGH)) { + ERR("goldfish_sync: Could not setup batch command address"); + return -ENODEV; + } + + if (!setup_verify_batch_cmd_addr(sync_state, + batch_addr_guestcmd, + SYNC_REG_BATCH_GUESTCOMMAND_ADDR, + SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH)) { + ERR("goldfish_sync: Could not setup batch guest command address"); + return -ENODEV; + } + + sync_state->batch_hostcmd = batch_addr_hostcmd; + sync_state->batch_guestcmd = batch_addr_guestcmd; + } + + INFO("goldfish_sync: Initialized goldfish sync device"); + + writel(0, sync_state->reg_base + SYNC_REG_INIT); + + return 0; +} + +static int goldfish_sync_remove(struct platform_device *pdev) +{ + struct goldfish_sync_state *sync_state = global_sync_state; + + DTRACE(); + + misc_deregister(&goldfish_sync_device); + memset(sync_state, 0, sizeof(struct goldfish_sync_state)); + return 0; +} + +static const struct of_device_id goldfish_sync_of_match[] = { + { .compatible = "google,goldfish-sync", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_sync_of_match); + +static const struct acpi_device_id goldfish_sync_acpi_match[] = { + { "GFSH0006", 0 }, + { }, +}; + +MODULE_DEVICE_TABLE(acpi, goldfish_sync_acpi_match); + +static struct platform_driver goldfish_sync = { + .probe = goldfish_sync_probe, + .remove = goldfish_sync_remove, + .driver = { + .name = "goldfish_sync", + .of_match_table = goldfish_sync_of_match, + .acpi_match_table = ACPI_PTR(goldfish_sync_acpi_match), + } +}; + +module_platform_driver(goldfish_sync); + +MODULE_AUTHOR("Google, Inc."); +MODULE_DESCRIPTION("Android QEMU Sync Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); + +/* This function is only to run a basic test of sync framework. + * It creates a timeline and fence object whose signal point is at 1. + * The timeline is incremented, and we use the sync framework's + * sync_fence_wait on that fence object. If everything works out, + * we should not hang in the wait and return immediately. + * There is no way to explicitly run this test yet, but it + * can be used by inserting it at the end of goldfish_sync_probe. + */ +void test_kernel_sync(void) +{ + struct goldfish_sync_timeline_obj *test_timeline; + int test_fence_fd; + + DTRACE(); + + DPRINT("test sw_sync"); + + test_timeline = goldfish_sync_timeline_create(); + DPRINT("sw_sync_timeline_create -> 0x%p", test_timeline); + + test_fence_fd = goldfish_sync_fence_create(test_timeline, 1); + DPRINT("sync_fence_create -> %d", test_fence_fd); + + DPRINT("incrementing test timeline"); + goldfish_sync_timeline_inc(test_timeline, 1); + + DPRINT("test waiting (should NOT hang)"); + sync_fence_wait( + sync_fence_fdget(test_fence_fd), -1); + + DPRINT("test waiting (afterward)"); +} From e96f35d6073717452d9bc484dad0c3cfb61f5c46 Mon Sep 17 00:00:00 2001 From: Yurii Zubrytskyi Date: Wed, 4 May 2016 13:05:38 -0700 Subject: [PATCH 0554/3220] ANDROID: goldfish_pipe: bugfixes and performance improvements. 
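In outline, the performance half of this change batches page pinning in
goldfish_pipe_read_write(): rather than pinning one page per iteration
with get_user_pages(), it pins up to MAX_PAGES_TO_GRAB pages with
get_user_pages_fast() and coalesces the physically contiguous prefix
into one transfer. A simplified sketch of the new loop (condensed from
the diff below; variable names as in the patch):

	ret = get_user_pages_fast(first_page, requested_pages, !is_write, pages);
	xaddr_prev = page_to_phys(pages[0]) | (address & ~PAGE_MASK);
	for (page_i = 1; page_i < ret; page_i++) {
		xaddr_i = page_to_phys(pages[page_i]) | (address & ~PAGE_MASK);
		if (xaddr_i != xaddr_prev + PAGE_SIZE)
			break;		/* discontinuous; stop coalescing */
		page_end += PAGE_SIZE;
		xaddr_prev = xaddr_i;
	}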
Combine following patches from android-goldfish-3.18 branch: c0f015a [pipe] Fix the pipe driver for x64 platform + correct pages count 48e6bf5 [pipe] Use get_use_pages_fast() which is possibly faster fb20f13 [goldfish] More pages in goldfish pipe f180e6d goldfish_pipe: Return from read_write on signal and EIO 3dec3b7 [pipe] Fix a minor leak in setup_access_params_addr() Change-Id: I1041fd65d7faaec123e6cedd3dbbc5a2fbb86c4d --- drivers/platform/goldfish/goldfish_pipe.c | 123 ++++++++++++++-------- 1 file changed, 77 insertions(+), 46 deletions(-) diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 3215a33cf4fe..55929ed58ea3 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -109,6 +109,16 @@ #define PIPE_WAKE_READ (1 << 1) /* pipe can now be read from */ #define PIPE_WAKE_WRITE (1 << 2) /* pipe can now be written to */ +#define MAX_PAGES_TO_GRAB 32 + +#define DEBUG 0 + +#if DEBUG +#define DPRINT(...) { printk(KERN_ERR __VA_ARGS__); } +#else +#define DPRINT(...) +#endif + struct access_params { unsigned long channel; u32 size; @@ -231,8 +241,10 @@ static int setup_access_params_addr(struct platform_device *pdev, if (valid_batchbuffer_addr(dev, aps)) { dev->aps = aps; return 0; - } else + } else { + devm_kfree(&pdev->dev, aps); return -1; + } } /* A value that will not be set by qemu emulator */ @@ -269,6 +281,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, struct goldfish_pipe *pipe = filp->private_data; struct goldfish_pipe_dev *dev = pipe->dev; unsigned long address, address_end; + struct page* pages[MAX_PAGES_TO_GRAB] = {}; int count = 0, ret = -EINVAL; /* If the emulator already closed the pipe, no need to go further */ @@ -292,45 +305,59 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, address_end = address + bufflen; while (address < address_end) { - unsigned long page_end = (address & PAGE_MASK) + PAGE_SIZE; - unsigned long next = page_end < address_end ? page_end - : address_end; - unsigned long avail = next - address; - int status, wakeBit; - - struct page *page; - - /* Either vaddr or paddr depending on the device version */ - unsigned long xaddr; + unsigned long page_end = (address & PAGE_MASK) + PAGE_SIZE; + unsigned long next, avail; + int status, wakeBit, page_i, num_contiguous_pages; + long first_page, last_page, requested_pages; + unsigned long xaddr, xaddr_prev, xaddr_i; /* - * We grab the pages on a page-by-page basis in case user - * space gives us a potentially huge buffer but the read only - * returns a small amount, then there's no need to pin that - * much memory to the process. + * Attempt to grab multiple physically contiguous pages. */ - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, address, 1, - !is_write, 0, &page, NULL); - up_read(¤t->mm->mmap_sem); - if (ret < 0) - return ret; - - if (dev->version) { - /* Device version 1 or newer (qemu-android) expects the - * physical address. */ - xaddr = page_to_phys(page) | (address & ~PAGE_MASK); - } else { - /* Device version 0 (classic emulator) expects the - * virtual address. 
*/ - xaddr = address; + first_page = address & PAGE_MASK; + last_page = (address_end - 1) & PAGE_MASK; + requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1; + if (requested_pages > MAX_PAGES_TO_GRAB) { + requested_pages = MAX_PAGES_TO_GRAB; } + ret = get_user_pages_fast(first_page, requested_pages, + !is_write, pages); + + DPRINT("%s: requested pages: %d %d\n", __FUNCTION__, ret, requested_pages); + if (ret == 0) { + DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", + __FUNCTION__, requested_pages); + return ret; + } + if (ret < 0) { + DPRINT("%s: (requested pages < 0) %d \n", + __FUNCTION__, requested_pages); + return ret; + } + + xaddr = page_to_phys(pages[0]) | (address & ~PAGE_MASK); + xaddr_prev = xaddr; + num_contiguous_pages = ret == 0 ? 0 : 1; + for (page_i = 1; page_i < ret; page_i++) { + xaddr_i = page_to_phys(pages[page_i]) | (address & ~PAGE_MASK); + if (xaddr_i == xaddr_prev + PAGE_SIZE) { + page_end += PAGE_SIZE; + xaddr_prev = xaddr_i; + num_contiguous_pages++; + } else { + DPRINT("%s: discontinuous page boundary: %d pages instead\n", + __FUNCTION__, page_i); + break; + } + } + next = page_end < address_end ? page_end : address_end; + avail = next - address; /* Now, try to transfer the bytes in the current page */ spin_lock_irqsave(&dev->lock, irq_flags); if (access_with_param(dev, - is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER, - xaddr, avail, pipe, &status)) { + is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER, + xaddr, avail, pipe, &status)) { gf_write_ptr(pipe, dev->base + PIPE_REG_CHANNEL, dev->base + PIPE_REG_CHANNEL_HIGH); writel(avail, dev->base + PIPE_REG_SIZE); @@ -343,9 +370,13 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, } spin_unlock_irqrestore(&dev->lock, irq_flags); - if (status > 0 && !is_write) - set_page_dirty(page); - put_page(page); + for (page_i = 0; page_i < ret; page_i++) { + if (status > 0 && !is_write && + page_i < num_contiguous_pages) { + set_page_dirty(pages[page_i]); + } + put_page(pages[page_i]); + } if (status > 0) { /* Correct transfer */ count += status; @@ -367,7 +398,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, */ if (status != PIPE_ERROR_AGAIN) pr_info_ratelimited("goldfish_pipe: backend returned error %d on %s\n", - status, is_write ? "write" : "read"); + status, is_write ? "write" : "read"); ret = 0; break; } @@ -377,7 +408,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, * non-blocking mode, just return the error code. */ if (status != PIPE_ERROR_AGAIN || - (filp->f_flags & O_NONBLOCK) != 0) { + (filp->f_flags & O_NONBLOCK) != 0) { ret = goldfish_pipe_error_convert(status); break; } @@ -391,7 +422,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, /* Tell the emulator we're going to wait for a wake event */ goldfish_cmd(pipe, - is_write ? CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ); + is_write ? 
		CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ);
 
 	/* Unlock the pipe, then wait for the wake signal */
 	mutex_unlock(&pipe->lock);
 
@@ -399,15 +430,11 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
 		while (test_bit(wakeBit, &pipe->flags)) {
 			if (wait_event_interruptible(
 					pipe->wake_queue,
-					!test_bit(wakeBit, &pipe->flags))) {
-				ret = -ERESTARTSYS;
-				break;
-			}
+					!test_bit(wakeBit, &pipe->flags)))
+				return -ERESTARTSYS;
 
-			if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) {
-				ret = -EIO;
-				break;
-			}
+			if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
+				return -EIO;
 		}
 
 		/* Try to re-acquire the lock */
@@ -543,6 +570,8 @@ static int goldfish_pipe_open(struct inode *inode, struct file *file)
 	pipe->dev = dev;
 	mutex_init(&pipe->lock);
+	DPRINT("%s: call. pipe_dev pipe_dev=0x%lx new_pipe_addr=0x%lx file=0x%lx\n", __FUNCTION__, pipe_dev, pipe, file);
+	// spin lock init, write head of list, i guess
 	init_waitqueue_head(&pipe->wake_queue);
 
 	/*
@@ -565,6 +594,7 @@ static int goldfish_pipe_release(struct inode *inode, struct file *filp)
 {
 	struct goldfish_pipe *pipe = filp->private_data;
 
+	DPRINT("%s: call. pipe=0x%lx file=0x%lx\n", __FUNCTION__, pipe, filp);
 	/* The guest is closing the channel, so tell the emulator right now */
 	goldfish_cmd(pipe, CMD_CLOSE);
 	kfree(pipe);
@@ -589,6 +619,7 @@ static struct miscdevice goldfish_pipe_device = {
 
 static int goldfish_pipe_probe(struct platform_device *pdev)
 {
+	DPRINT("%s: call. platform_device=0x%lx\n", __FUNCTION__, pdev);
 	int err;
 	struct resource *r;
 	struct goldfish_pipe_dev *dev = pipe_dev;

From bc43565e1ac5ba3f204886a2275726bb4c3d44e6 Mon Sep 17 00:00:00 2001
From: Yurii Zubrytskyi
Date: Fri, 29 Jul 2016 10:51:46 -0700
Subject: [PATCH 0555/3220] ANDROID: goldfish_pipe: An implementation of more
 parallel pipe

This is the driver code for a redesigned Android pipe. Currently it
works for x86 and x64 emulators with the following performance results:

ADB push to /dev/null, Ubuntu, 400 MB file, times are for 1/10/100
parallel adb commands
x86 adb push: (4.4s / 11.5s / 2m10s) -> (2.8s / 6s / 51s)
x64 adb push: (7s / 15s / too long, 6m+) -> (2.7s / 6.2s / 52s)

ADB pull and push to /data/ show the same percentage of speedup.

More importantly, I don't see any signs of slowdowns when run in
parallel with the Antutu benchmark, so it is definitely doing a much
better job at multithreading.

The code features dynamic host detection: an old emulator gets the
previous version of the pipe driver code.
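In outline, probe can now dispatch on the device's version register
along these lines (a sketch only; the v2 entry-point name and the
version constant are assumptions modeled on the v1 entry points this
patch introduces):

	dev->version = readl(dev->base + PIPE_REG_VERSION);
	if (dev->version < PIPE_CURRENT_DEVICE_VERSION)
		err = goldfish_pipe_device_init_v1(pdev);	/* legacy driver */
	else
		err = goldfish_pipe_device_init_v2(pdev);	/* parallel driver */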
Combine follow patch from android-goldfish-3.10 b543285 [pipe] Increase the default pipe buffers size, make it configurable Signed-off-by: "Yurii Zubrytskyi" Change-Id: I140d506204cab6e78dd503e5a43abc8886e4ffff --- drivers/platform/goldfish/Makefile | 2 +- drivers/platform/goldfish/goldfish_pipe.c | 168 +--- drivers/platform/goldfish/goldfish_pipe.h | 91 ++ drivers/platform/goldfish/goldfish_pipe_v2.c | 888 +++++++++++++++++++ 4 files changed, 1005 insertions(+), 144 deletions(-) create mode 100644 drivers/platform/goldfish/goldfish_pipe.h create mode 100644 drivers/platform/goldfish/goldfish_pipe_v2.c diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile index d3487125838c..e53ae2fc717b 100644 --- a/drivers/platform/goldfish/Makefile +++ b/drivers/platform/goldfish/Makefile @@ -2,4 +2,4 @@ # Makefile for Goldfish platform specific drivers # obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o -obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o +obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o goldfish_pipe_v2.o diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 55929ed58ea3..cf7ce97e7346 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -15,51 +15,11 @@ * */ -/* This source file contains the implementation of a special device driver - * that intends to provide a *very* fast communication channel between the - * guest system and the QEMU emulator. - * - * Usage from the guest is simply the following (error handling simplified): - * - * int fd = open("/dev/qemu_pipe",O_RDWR); - * .... write() or read() through the pipe. - * - * This driver doesn't deal with the exact protocol used during the session. - * It is intended to be as simple as something like: - * - * // do this _just_ after opening the fd to connect to a specific - * // emulator service. - * const char* msg = ""; - * if (write(fd, msg, strlen(msg)+1) < 0) { - * ... could not connect to service - * close(fd); - * } - * - * // after this, simply read() and write() to communicate with the - * // service. Exact protocol details left as an exercise to the reader. - * - * This driver is very fast because it doesn't copy any data through - * intermediate buffers, since the emulator is capable of translating - * guest user addresses into host ones. - * - * Note that we must however ensure that each user page involved in the - * exchange is properly mapped during a transfer. +/* This source file contains the implementation of the legacy version of + * a goldfish pipe device driver. See goldfish_pipe_v2.c for the current + * version. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "goldfish_pipe.h" /* * IMPORTANT: The following constants must match the ones used and defined @@ -119,6 +79,14 @@ #define DPRINT(...) #endif +/* This data type models a given pipe instance */ +struct goldfish_pipe { + struct goldfish_pipe_dev *dev; + struct mutex lock; + unsigned long flags; + wait_queue_head_t wake_queue; +}; + struct access_params { unsigned long channel; u32 size; @@ -129,29 +97,6 @@ struct access_params { u32 flags; }; -/* The global driver data. Holds a reference to the i/o page used to - * communicate with the emulator, and a wake queue for blocked tasks - * waiting to be awoken. 
- */ -struct goldfish_pipe_dev { - spinlock_t lock; - unsigned char __iomem *base; - struct access_params *aps; - int irq; - u32 version; -}; - -static struct goldfish_pipe_dev pipe_dev[1]; - -/* This data type models a given pipe instance */ -struct goldfish_pipe { - struct goldfish_pipe_dev *dev; - struct mutex lock; - unsigned long flags; - wait_queue_head_t wake_queue; -}; - - /* Bit flags for the 'flags' field */ enum { BIT_CLOSED_ON_HOST = 0, /* pipe closed by host */ @@ -323,7 +268,8 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, ret = get_user_pages_fast(first_page, requested_pages, !is_write, pages); - DPRINT("%s: requested pages: %d %d\n", __FUNCTION__, ret, requested_pages); + DPRINT("%s: requested pages: %d %d %p\n", __FUNCTION__, + ret, requested_pages, first_page); if (ret == 0) { DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", __FUNCTION__, requested_pages); @@ -611,97 +557,33 @@ static const struct file_operations goldfish_pipe_fops = { .release = goldfish_pipe_release, }; -static struct miscdevice goldfish_pipe_device = { +static struct miscdevice goldfish_pipe_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "goldfish_pipe", .fops = &goldfish_pipe_fops, }; -static int goldfish_pipe_probe(struct platform_device *pdev) +int goldfish_pipe_device_init_v1(struct platform_device *pdev) { - DPRINT("%s: call. platform_device=0x%lx\n", __FUNCTION__, pdev); - int err; - struct resource *r; struct goldfish_pipe_dev *dev = pipe_dev; - - /* not thread safe, but this should not happen */ - WARN_ON(dev->base != NULL); - - spin_lock_init(&dev->lock); - - r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (r == NULL || resource_size(r) < PAGE_SIZE) { - dev_err(&pdev->dev, "can't allocate i/o page\n"); - return -EINVAL; - } - dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE); - if (dev->base == NULL) { - dev_err(&pdev->dev, "ioremap failed\n"); - return -EINVAL; - } - - r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (r == NULL) { - err = -EINVAL; - goto error; - } - dev->irq = r->start; - - err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, + int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, IRQF_SHARED, "goldfish_pipe", dev); if (err) { - dev_err(&pdev->dev, "unable to allocate IRQ\n"); - goto error; + dev_err(&pdev->dev, "unable to allocate IRQ for v1\n"); + return err; } - err = misc_register(&goldfish_pipe_device); + err = misc_register(&goldfish_pipe_dev); if (err) { - dev_err(&pdev->dev, "unable to register device\n"); - goto error; + dev_err(&pdev->dev, "unable to register v1 device\n"); + return err; } + setup_access_params_addr(pdev, dev); - - /* Although the pipe device in the classic Android emulator does not - * recognize the 'version' register, it won't treat this as an error - * either and will simply return 0, which is fine. 
*/ - dev->version = readl(dev->base + PIPE_REG_VERSION); return 0; - -error: - dev->base = NULL; - return err; } -static int goldfish_pipe_remove(struct platform_device *pdev) +void goldfish_pipe_device_deinit_v1(struct platform_device *pdev) { - struct goldfish_pipe_dev *dev = pipe_dev; - misc_deregister(&goldfish_pipe_device); - dev->base = NULL; - return 0; + misc_deregister(&goldfish_pipe_dev); } - -static const struct acpi_device_id goldfish_pipe_acpi_match[] = { - { "GFSH0003", 0 }, - { }, -}; -MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match); - -static const struct of_device_id goldfish_pipe_of_match[] = { - { .compatible = "generic,android-pipe", }, - {}, -}; -MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match); - -static struct platform_driver goldfish_pipe = { - .probe = goldfish_pipe_probe, - .remove = goldfish_pipe_remove, - .driver = { - .name = "goldfish_pipe", - .of_match_table = goldfish_pipe_of_match, - .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match), - } -}; - -module_platform_driver(goldfish_pipe); -MODULE_AUTHOR("David Turner "); -MODULE_LICENSE("GPL"); diff --git a/drivers/platform/goldfish/goldfish_pipe.h b/drivers/platform/goldfish/goldfish_pipe.h new file mode 100644 index 000000000000..9b75a51dba24 --- /dev/null +++ b/drivers/platform/goldfish/goldfish_pipe.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef GOLDFISH_PIPE_H +#define GOLDFISH_PIPE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Initialize the legacy version of the pipe device driver */ +int goldfish_pipe_device_init_v1(struct platform_device *pdev); + +/* Deinitialize the legacy version of the pipe device driver */ +void goldfish_pipe_device_deinit_v1(struct platform_device *pdev); + +/* Forward declarations for the device struct */ +struct goldfish_pipe; +struct goldfish_pipe_dev_buffers; + +/* The global driver data. Holds a reference to the i/o page used to + * communicate with the emulator, and a wake queue for blocked tasks + * waiting to be awoken. + */ +struct goldfish_pipe_dev { + /* + * Global device spinlock. Protects the following members: + * - pipes, pipes_capacity + * - [*pipes, *pipes + pipes_capacity) - array data + * - first_signalled_pipe, + * goldfish_pipe::prev_signalled, + * goldfish_pipe::next_signalled, + * goldfish_pipe::signalled_flags - all signalled-related fields, + * in all allocated pipes + * - open_command_params - PIPE_CMD_OPEN-related buffers + * + * It looks like a lot of different fields, but the trick is that the only + * operation that happens often is the signalled pipes array manipulation. + * That's why it's OK for now to keep the rest of the fields under the same + * lock. If we notice too much contention because of PIPE_CMD_OPEN, + * then we should add a separate lock there.
+ */ + spinlock_t lock; + + /* + * Array of the pipes of |pipes_capacity| elements, + * indexed by goldfish_pipe::id + */ + struct goldfish_pipe **pipes; + u32 pipes_capacity; + + /* Pointers to the buffers host uses for interaction with this driver */ + struct goldfish_pipe_dev_buffers *buffers; + + /* Head of a doubly linked list of signalled pipes */ + struct goldfish_pipe *first_signalled_pipe; + + /* Some device-specific data */ + int irq; + int version; + unsigned char __iomem *base; + + /* v1-specific access parameters */ + struct access_params *aps; +}; + +extern struct goldfish_pipe_dev pipe_dev[1]; + +#endif /* GOLDFISH_PIPE_H */ diff --git a/drivers/platform/goldfish/goldfish_pipe_v2.c b/drivers/platform/goldfish/goldfish_pipe_v2.c new file mode 100644 index 000000000000..bdb039bc7dde --- /dev/null +++ b/drivers/platform/goldfish/goldfish_pipe_v2.c @@ -0,0 +1,888 @@ +/* + * Copyright (C) 2012 Intel, Inc. + * Copyright (C) 2013 Intel, Inc. + * Copyright (C) 2014 Linaro Limited + * Copyright (C) 2011-2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +/* This source file contains the implementation of a special device driver + * that intends to provide a *very* fast communication channel between the + * guest system and the QEMU emulator. + * + * Usage from the guest is simply the following (error handling simplified): + * + * int fd = open("/dev/qemu_pipe",O_RDWR); + * .... write() or read() through the pipe. + * + * This driver doesn't deal with the exact protocol used during the session. + * It is intended to be as simple as something like: + * + * // do this _just_ after opening the fd to connect to a specific + * // emulator service. + * const char* msg = ""; + * if (write(fd, msg, strlen(msg)+1) < 0) { + * ... could not connect to service + * close(fd); + * } + * + * // after this, simply read() and write() to communicate with the + * // service. Exact protocol details left as an exercise to the reader. + * + * This driver is very fast because it doesn't copy any data through + * intermediate buffers, since the emulator is capable of translating + * guest user addresses into host ones. + * + * Note that we must however ensure that each user page involved in the + * exchange is properly mapped during a transfer. + */ + +#include "goldfish_pipe.h" + + +/* + * Update this when something changes in the driver's behavior so the host + * can benefit from knowing it + */ +enum { + PIPE_DRIVER_VERSION = 2, + PIPE_CURRENT_DEVICE_VERSION = 2 +}; + +/* + * IMPORTANT: The following constants must match the ones used and defined + * in external/qemu/hw/goldfish_pipe.c in the Android source tree. 
+ */ + +/* List of bitflags returned in status of CMD_POLL command */ +enum PipePollFlags { + PIPE_POLL_IN = 1 << 0, + PIPE_POLL_OUT = 1 << 1, + PIPE_POLL_HUP = 1 << 2 +}; + +/* Possible status values used to signal errors - see goldfish_pipe_error_convert */ +enum PipeErrors { + PIPE_ERROR_INVAL = -1, + PIPE_ERROR_AGAIN = -2, + PIPE_ERROR_NOMEM = -3, + PIPE_ERROR_IO = -4 +}; + +/* Bit-flags used to signal events from the emulator */ +enum PipeWakeFlags { + PIPE_WAKE_CLOSED = 1 << 0, /* emulator closed pipe */ + PIPE_WAKE_READ = 1 << 1, /* pipe can now be read from */ + PIPE_WAKE_WRITE = 1 << 2 /* pipe can now be written to */ +}; + +/* Bit flags for the 'flags' field */ +enum PipeFlagsBits { + BIT_CLOSED_ON_HOST = 0, /* pipe closed by host */ + BIT_WAKE_ON_WRITE = 1, /* want to be woken on writes */ + BIT_WAKE_ON_READ = 2, /* want to be woken on reads */ +}; + +enum PipeRegs { + PIPE_REG_CMD = 0, + + PIPE_REG_SIGNAL_BUFFER_HIGH = 4, + PIPE_REG_SIGNAL_BUFFER = 8, + PIPE_REG_SIGNAL_BUFFER_COUNT = 12, + + PIPE_REG_OPEN_BUFFER_HIGH = 20, + PIPE_REG_OPEN_BUFFER = 24, + + PIPE_REG_VERSION = 36, + + PIPE_REG_GET_SIGNALLED = 48, +}; + +enum PipeCmdCode { + PIPE_CMD_OPEN = 1, /* to be used by the pipe device itself */ + PIPE_CMD_CLOSE, + PIPE_CMD_POLL, + PIPE_CMD_WRITE, + PIPE_CMD_WAKE_ON_WRITE, + PIPE_CMD_READ, + PIPE_CMD_WAKE_ON_READ, + + /* + * TODO(zyy): implement a deferred read/write execution to allow parallel + * processing of pipe operations on the host. + */ + PIPE_CMD_WAKE_ON_DONE_IO, +}; + +enum { + MAX_BUFFERS_PER_COMMAND = 336, + MAX_SIGNALLED_PIPES = 64, + INITIAL_PIPES_CAPACITY = 64 +}; + +struct goldfish_pipe_dev; +struct goldfish_pipe; +struct goldfish_pipe_command; + +/* A per-pipe command structure, shared with the host */ +struct goldfish_pipe_command { + s32 cmd; /* PipeCmdCode, guest -> host */ + s32 id; /* pipe id, guest -> host */ + s32 status; /* command execution status, host -> guest */ + s32 reserved; /* to pad to 64-bit boundary */ + union { + /* Parameters for PIPE_CMD_{READ,WRITE} */ + struct { + u32 buffers_count; /* number of buffers, guest -> host */ + s32 consumed_size; /* number of consumed bytes, host -> guest */ + u64 ptrs[MAX_BUFFERS_PER_COMMAND]; /* buffer pointers, guest -> host */ + u32 sizes[MAX_BUFFERS_PER_COMMAND]; /* buffer sizes, guest -> host */ + } rw_params; + }; +}; + +/* Information about a single signalled pipe */ +struct signalled_pipe_buffer { + u32 id; + u32 flags; +}; + +/* Parameters for the PIPE_CMD_OPEN command */ +struct open_command_param { + u64 command_buffer_ptr; + u32 rw_params_max_count; +}; + +/* Device-level set of buffers shared with the host */ +struct goldfish_pipe_dev_buffers { + struct open_command_param open_command_params; + struct signalled_pipe_buffer signalled_pipe_buffers[MAX_SIGNALLED_PIPES]; +}; + +/* This data type models a given pipe instance */ +struct goldfish_pipe { + u32 id; /* pipe ID - index into goldfish_pipe_dev::pipes array */ + unsigned long flags; /* The wake flags the pipe is waiting for + * Note: not protected with any lock, uses atomic operations + * and barriers to make it thread-safe. + */ + unsigned long signalled_flags; /* wake flags the host has signalled, + * - protected by goldfish_pipe_dev::lock */ + + struct goldfish_pipe_command *command_buffer; /* A pointer to the command buffer */ + + /* doubly linked list of signalled pipes, protected by goldfish_pipe_dev::lock */ + struct goldfish_pipe *prev_signalled; + struct goldfish_pipe *next_signalled; + + /* + * A pipe's own lock.
Protects the following: + * - *command_buffer - makes sure a command can safely write its parameters + * to the host and read the results back. + */ + struct mutex lock; + + wait_queue_head_t wake_queue; /* A wake queue for sleeping until host signals an event */ + struct goldfish_pipe_dev *dev; /* Pointer to the parent goldfish_pipe_dev instance */ +}; + +struct goldfish_pipe_dev pipe_dev[1] = {}; + +static int goldfish_cmd_locked(struct goldfish_pipe *pipe, enum PipeCmdCode cmd) +{ + pipe->command_buffer->cmd = cmd; + pipe->command_buffer->status = PIPE_ERROR_INVAL; /* failure by default */ + writel(pipe->id, pipe->dev->base + PIPE_REG_CMD); + return pipe->command_buffer->status; +} + +static int goldfish_cmd(struct goldfish_pipe *pipe, enum PipeCmdCode cmd) +{ + int status; + if (mutex_lock_interruptible(&pipe->lock)) + return PIPE_ERROR_IO; + status = goldfish_cmd_locked(pipe, cmd); + mutex_unlock(&pipe->lock); + return status; +} + +/* + * This function converts an error code returned by the emulator through + * the PIPE_REG_STATUS i/o register into a valid negative errno value. + */ +static int goldfish_pipe_error_convert(int status) +{ + switch (status) { + case PIPE_ERROR_AGAIN: + return -EAGAIN; + case PIPE_ERROR_NOMEM: + return -ENOMEM; + case PIPE_ERROR_IO: + return -EIO; + default: + return -EINVAL; + } +} + +static int pin_user_pages(unsigned long first_page, unsigned long last_page, + unsigned last_page_size, int is_write, + struct page *pages[MAX_BUFFERS_PER_COMMAND], unsigned *iter_last_page_size) +{ + int ret; + int requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1; + if (requested_pages > MAX_BUFFERS_PER_COMMAND) { + requested_pages = MAX_BUFFERS_PER_COMMAND; + *iter_last_page_size = PAGE_SIZE; + } else { + *iter_last_page_size = last_page_size; + } + + ret = get_user_pages_fast( + first_page, requested_pages, !is_write, pages); + if (ret <= 0) + return -EFAULT; + if (ret < requested_pages) + *iter_last_page_size = PAGE_SIZE; + return ret; + +} + +static void release_user_pages(struct page **pages, int pages_count, + int is_write, s32 consumed_size) +{ + int i; + for (i = 0; i < pages_count; i++) { + if (!is_write && consumed_size > 0) { + set_page_dirty(pages[i]); + } + put_page(pages[i]); + } +} + +/* Populate the call parameters, merging adjacent pages together */ +static void populate_rw_params( + struct page **pages, int pages_count, + unsigned long address, unsigned long address_end, + unsigned long first_page, unsigned long last_page, + unsigned iter_last_page_size, int is_write, + struct goldfish_pipe_command *command) +{ + /* + * Process the first page separately - it's the only page that + * needs special handling for its start address. + */ + unsigned long xaddr = page_to_phys(pages[0]); + unsigned long xaddr_prev = xaddr; + int buffer_idx = 0; + int i = 1; + int size_on_page = first_page == last_page + ? (int)(address_end - address) + : (PAGE_SIZE - (address & ~PAGE_MASK)); + command->rw_params.ptrs[0] = (u64)(xaddr | (address & ~PAGE_MASK)); + command->rw_params.sizes[0] = size_on_page; + for (; i < pages_count; ++i) { + xaddr = page_to_phys(pages[i]); + size_on_page = (i == pages_count - 1) ? 
iter_last_page_size : PAGE_SIZE; + if (xaddr == xaddr_prev + PAGE_SIZE) { + command->rw_params.sizes[buffer_idx] += size_on_page; + } else { + ++buffer_idx; + command->rw_params.ptrs[buffer_idx] = (u64)xaddr; + command->rw_params.sizes[buffer_idx] = size_on_page; + } + xaddr_prev = xaddr; + } + command->rw_params.buffers_count = buffer_idx + 1; +} + +static int transfer_max_buffers(struct goldfish_pipe* pipe, + unsigned long address, unsigned long address_end, int is_write, + unsigned long last_page, unsigned int last_page_size, + s32* consumed_size, int* status) +{ + struct page *pages[MAX_BUFFERS_PER_COMMAND]; + unsigned long first_page = address & PAGE_MASK; + unsigned int iter_last_page_size; + int pages_count = pin_user_pages(first_page, last_page, + last_page_size, is_write, + pages, &iter_last_page_size); + if (pages_count < 0) + return pages_count; + + /* Serialize access to the pipe command buffers */ + if (mutex_lock_interruptible(&pipe->lock)) + return -ERESTARTSYS; + + populate_rw_params(pages, pages_count, address, address_end, + first_page, last_page, iter_last_page_size, is_write, + pipe->command_buffer); + + /* Transfer the data */ + *status = goldfish_cmd_locked(pipe, + is_write ? PIPE_CMD_WRITE : PIPE_CMD_READ); + + *consumed_size = pipe->command_buffer->rw_params.consumed_size; + + mutex_unlock(&pipe->lock); + + release_user_pages(pages, pages_count, is_write, *consumed_size); + + return 0; +} + +static int wait_for_host_signal(struct goldfish_pipe *pipe, int is_write) +{ + u32 wakeBit = is_write ? BIT_WAKE_ON_WRITE : BIT_WAKE_ON_READ; + set_bit(wakeBit, &pipe->flags); + + /* Tell the emulator we're going to wait for a wake event */ + (void)goldfish_cmd(pipe, + is_write ? PIPE_CMD_WAKE_ON_WRITE : PIPE_CMD_WAKE_ON_READ); + + while (test_bit(wakeBit, &pipe->flags)) { + if (wait_event_interruptible( + pipe->wake_queue, + !test_bit(wakeBit, &pipe->flags))) + return -ERESTARTSYS; + + if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) + return -EIO; + } + + return 0; +} + +static ssize_t goldfish_pipe_read_write(struct file *filp, + char __user *buffer, size_t bufflen, int is_write) +{ + struct goldfish_pipe *pipe = filp->private_data; + int count = 0, ret = -EINVAL; + unsigned long address, address_end, last_page; + unsigned int last_page_size; + + /* If the emulator already closed the pipe, no need to go further */ + if (unlikely(test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))) + return -EIO; + /* Null reads or writes succeed */ + if (unlikely(bufflen == 0)) + return 0; + /* Check the buffer range for access */ + if (unlikely(!access_ok(is_write ? VERIFY_WRITE : VERIFY_READ, + buffer, bufflen))) + return -EFAULT; + + address = (unsigned long)buffer; + address_end = address + bufflen; + last_page = (address_end - 1) & PAGE_MASK; + last_page_size = ((address_end - 1) & ~PAGE_MASK) + 1; + + while (address < address_end) { + s32 consumed_size; + int status; + ret = transfer_max_buffers(pipe, address, address_end, is_write, + last_page, last_page_size, &consumed_size, &status); + if (ret < 0) + break; + + if (consumed_size > 0) { + /* No matter what the status is, we've transferred something */ + count += consumed_size; + address += consumed_size; + } + if (status > 0) + continue; + if (status == 0) { + /* EOF */ + ret = 0; + break; + } + if (count > 0) { + /* + * An error occurred, but we already transferred + * something on one of the previous iterations. + * Just return what we already copied and log this + * err.
+ */ + if (status != PIPE_ERROR_AGAIN) + pr_info_ratelimited("goldfish_pipe: backend error %d on %s\n", + status, is_write ? "write" : "read"); + break; + } + + /* + * If the error is not PIPE_ERROR_AGAIN, or if we are in + * non-blocking mode, just return the error code. + */ + if (status != PIPE_ERROR_AGAIN || (filp->f_flags & O_NONBLOCK) != 0) { + ret = goldfish_pipe_error_convert(status); + break; + } + + status = wait_for_host_signal(pipe, is_write); + if (status < 0) + return status; + } + + if (count > 0) + return count; + return ret; +} + +static ssize_t goldfish_pipe_read(struct file *filp, char __user *buffer, + size_t bufflen, loff_t *ppos) +{ + return goldfish_pipe_read_write(filp, buffer, bufflen, /* is_write */ 0); +} + +static ssize_t goldfish_pipe_write(struct file *filp, + const char __user *buffer, size_t bufflen, + loff_t *ppos) +{ + return goldfish_pipe_read_write(filp, + /* cast away the const */(char __user *)buffer, bufflen, + /* is_write */ 1); +} + +static unsigned int goldfish_pipe_poll(struct file *filp, poll_table *wait) +{ + struct goldfish_pipe *pipe = filp->private_data; + unsigned int mask = 0; + int status; + + poll_wait(filp, &pipe->wake_queue, wait); + + status = goldfish_cmd(pipe, PIPE_CMD_POLL); + if (status < 0) { + return -ERESTARTSYS; + } + + if (status & PIPE_POLL_IN) + mask |= POLLIN | POLLRDNORM; + if (status & PIPE_POLL_OUT) + mask |= POLLOUT | POLLWRNORM; + if (status & PIPE_POLL_HUP) + mask |= POLLHUP; + if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) + mask |= POLLERR; + + return mask; +} + +static void signalled_pipes_add_locked(struct goldfish_pipe_dev *dev, + u32 id, u32 flags) +{ + struct goldfish_pipe *pipe; + + BUG_ON(id >= dev->pipes_capacity); + + pipe = dev->pipes[id]; + if (!pipe) + return; + pipe->signalled_flags |= flags; + + if (pipe->prev_signalled || pipe->next_signalled + || dev->first_signalled_pipe == pipe) + return; /* already in the list */ + pipe->next_signalled = dev->first_signalled_pipe; + if (dev->first_signalled_pipe) { + dev->first_signalled_pipe->prev_signalled = pipe; + } + dev->first_signalled_pipe = pipe; +} + +static void signalled_pipes_remove_locked(struct goldfish_pipe_dev *dev, + struct goldfish_pipe *pipe) { + if (pipe->prev_signalled) + pipe->prev_signalled->next_signalled = pipe->next_signalled; + if (pipe->next_signalled) + pipe->next_signalled->prev_signalled = pipe->prev_signalled; + if (pipe == dev->first_signalled_pipe) + dev->first_signalled_pipe = pipe->next_signalled; + pipe->prev_signalled = NULL; + pipe->next_signalled = NULL; +} + +static struct goldfish_pipe *signalled_pipes_pop_front(struct goldfish_pipe_dev *dev, + int *wakes) +{ + struct goldfish_pipe *pipe; + unsigned long flags; + spin_lock_irqsave(&dev->lock, flags); + + pipe = dev->first_signalled_pipe; + if (pipe) { + *wakes = pipe->signalled_flags; + pipe->signalled_flags = 0; + /* + * This is an optimized version of signalled_pipes_remove_locked() - + * we want waking up the sleeping pipe operations to be as fast + * as possible + */ + dev->first_signalled_pipe = pipe->next_signalled; + if (dev->first_signalled_pipe) + dev->first_signalled_pipe->prev_signalled = NULL; + pipe->next_signalled = NULL; + } + + spin_unlock_irqrestore(&dev->lock, flags); + return pipe; +} + +static void goldfish_interrupt_task(unsigned long unused) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + /* Iterate over the signalled pipes and wake them one by one */ + struct goldfish_pipe *pipe; + int wakes; + while ((pipe =
signalled_pipes_pop_front(dev, &wakes)) != NULL) { + if (wakes & PIPE_WAKE_CLOSED) { + pipe->flags = 1 << BIT_CLOSED_ON_HOST; + } else { + if (wakes & PIPE_WAKE_READ) + clear_bit(BIT_WAKE_ON_READ, &pipe->flags); + if (wakes & PIPE_WAKE_WRITE) + clear_bit(BIT_WAKE_ON_WRITE, &pipe->flags); + } + /* + * wake_up_interruptible() implies a write barrier, so don't explicitly + * add another one here. + */ + wake_up_interruptible(&pipe->wake_queue); + } +} +DECLARE_TASKLET(goldfish_interrupt_tasklet, goldfish_interrupt_task, 0); + +/* + * The general idea of the interrupt handling: + * + * 1. device raises an interrupt if there's at least one signalled pipe + * 2. IRQ handler reads the signalled pipes and their count from the device + * 3. device writes them into a shared buffer and returns the count + * it only resets the IRQ if it has returned all signalled pipes, + * otherwise it leaves it raised, so IRQ handler will be called + * again for the next chunk + * 4. IRQ handler adds all returned pipes to the device's signalled pipes list + * 5. IRQ handler launches a tasklet to process the signalled pipes from the + * list in a separate context + */ +static irqreturn_t goldfish_pipe_interrupt(int irq, void *dev_id) +{ + u32 count; + u32 i; + unsigned long flags; + struct goldfish_pipe_dev *dev = dev_id; + if (dev != pipe_dev) + return IRQ_NONE; + + /* Request the signalled pipes from the device */ + spin_lock_irqsave(&dev->lock, flags); + + count = readl(dev->base + PIPE_REG_GET_SIGNALLED); + if (count == 0) { + spin_unlock_irqrestore(&dev->lock, flags); + return IRQ_NONE; + } + if (count > MAX_SIGNALLED_PIPES) + count = MAX_SIGNALLED_PIPES; + + for (i = 0; i < count; ++i) + signalled_pipes_add_locked(dev, + dev->buffers->signalled_pipe_buffers[i].id, + dev->buffers->signalled_pipe_buffers[i].flags); + + spin_unlock_irqrestore(&dev->lock, flags); + + tasklet_schedule(&goldfish_interrupt_tasklet); + return IRQ_HANDLED; +} + +static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev) +{ + int id; + for (id = 0; id < dev->pipes_capacity; ++id) + if (!dev->pipes[id]) + return id; + + { + /* Reallocate the array */ + u32 new_capacity = 2 * dev->pipes_capacity; + struct goldfish_pipe **pipes = + kcalloc(new_capacity, sizeof(*pipes), GFP_KERNEL); + if (!pipes) + return -ENOMEM; + memcpy(pipes, dev->pipes, sizeof(*pipes) * dev->pipes_capacity); + kfree(dev->pipes); + dev->pipes = pipes; + id = dev->pipes_capacity; + dev->pipes_capacity = new_capacity; + } + return id; +} + +/** + * goldfish_pipe_open - open a channel to the AVD + * @inode: inode of device + * @file: file struct of opener + * + * Create a new pipe link between the emulator and the user application. + * Each new request produces a new pipe. + * + * Note: we use the pipe ID as a mux. All goldfish emulations are 32bit + * right now so this is fine. A move to 64bit will need to revisit this addressing + */ +static int goldfish_pipe_open(struct inode *inode, struct file *file) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + unsigned long flags; + int id; + int status; + + /* Allocate new pipe kernel object */ + struct goldfish_pipe *pipe = kzalloc(sizeof(*pipe), GFP_KERNEL); + if (pipe == NULL) + return -ENOMEM; + + pipe->dev = dev; + mutex_init(&pipe->lock); + init_waitqueue_head(&pipe->wake_queue); + + /* + * The command buffer needs to be allocated on its own page to make sure + * it is physically contiguous in the host's address space.
+ */ + pipe->command_buffer = + (struct goldfish_pipe_command*)__get_free_page(GFP_KERNEL); + if (!pipe->command_buffer) { + status = -ENOMEM; + goto err_pipe; + } + + spin_lock_irqsave(&dev->lock, flags); + + id = get_free_pipe_id_locked(dev); + if (id < 0) { + status = id; + goto err_id_locked; + } + + dev->pipes[id] = pipe; + pipe->id = id; + pipe->command_buffer->id = id; + + /* Now tell the emulator we're opening a new pipe. */ + dev->buffers->open_command_params.rw_params_max_count = + MAX_BUFFERS_PER_COMMAND; + dev->buffers->open_command_params.command_buffer_ptr = + (u64)(unsigned long)__pa(pipe->command_buffer); + status = goldfish_cmd_locked(pipe, PIPE_CMD_OPEN); + spin_unlock_irqrestore(&dev->lock, flags); + if (status < 0) + goto err_cmd; + /* All is done, save the pipe into the file's private data field */ + file->private_data = pipe; + return 0; + +err_cmd: + spin_lock_irqsave(&dev->lock, flags); + dev->pipes[id] = NULL; +err_id_locked: + spin_unlock_irqrestore(&dev->lock, flags); + free_page((unsigned long)pipe->command_buffer); +err_pipe: + kfree(pipe); + return status; +} + +static int goldfish_pipe_release(struct inode *inode, struct file *filp) +{ + unsigned long flags; + struct goldfish_pipe *pipe = filp->private_data; + struct goldfish_pipe_dev *dev = pipe->dev; + + /* The guest is closing the channel, so tell the emulator right now */ + (void)goldfish_cmd(pipe, PIPE_CMD_CLOSE); + + spin_lock_irqsave(&dev->lock, flags); + dev->pipes[pipe->id] = NULL; + signalled_pipes_remove_locked(dev, pipe); + spin_unlock_irqrestore(&dev->lock, flags); + + filp->private_data = NULL; + free_page((unsigned long)pipe->command_buffer); + kfree(pipe); + return 0; +} + +static const struct file_operations goldfish_pipe_fops = { + .owner = THIS_MODULE, + .read = goldfish_pipe_read, + .write = goldfish_pipe_write, + .poll = goldfish_pipe_poll, + .open = goldfish_pipe_open, + .release = goldfish_pipe_release, +}; + +static struct miscdevice goldfish_pipe_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "goldfish_pipe", + .fops = &goldfish_pipe_fops, +}; + +static int goldfish_pipe_device_init_v2(struct platform_device *pdev) +{ + char *page; + struct goldfish_pipe_dev *dev = pipe_dev; + int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt, + IRQF_SHARED, "goldfish_pipe", dev); + if (err) { + dev_err(&pdev->dev, "unable to allocate IRQ for v2\n"); + return err; + } + + err = misc_register(&goldfish_pipe_dev); + if (err) { + dev_err(&pdev->dev, "unable to register v2 device\n"); + return err; + } + + dev->first_signalled_pipe = NULL; + dev->pipes_capacity = INITIAL_PIPES_CAPACITY; + dev->pipes = kcalloc(dev->pipes_capacity, sizeof(*dev->pipes), GFP_KERNEL); + if (!dev->pipes) + return -ENOMEM; + + /* + * We're going to pass two buffers, open_command_params and + * signalled_pipe_buffers, to the host. This means each of those buffers + * needs to be contained in a single physical page. The easiest choice is + * to just allocate a page and place the buffers in it. 
+ */ + BUG_ON(sizeof(*dev->buffers) > PAGE_SIZE); + page = (char*)__get_free_page(GFP_KERNEL); + if (!page) { + kfree(dev->pipes); + return -ENOMEM; + } + dev->buffers = (struct goldfish_pipe_dev_buffers*)page; + + /* Send the buffer addresses to the host */ + { + u64 paddr = __pa(&dev->buffers->signalled_pipe_buffers); + writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_SIGNAL_BUFFER_HIGH); + writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_SIGNAL_BUFFER); + writel((u32)MAX_SIGNALLED_PIPES, dev->base + PIPE_REG_SIGNAL_BUFFER_COUNT); + + paddr = __pa(&dev->buffers->open_command_params); + writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_OPEN_BUFFER_HIGH); + writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_OPEN_BUFFER); + } + return 0; +} + +static void goldfish_pipe_device_deinit_v2(struct platform_device *pdev) { + struct goldfish_pipe_dev *dev = pipe_dev; + misc_deregister(&goldfish_pipe_dev); + kfree(dev->pipes); + free_page((unsigned long)dev->buffers); +} + +static int goldfish_pipe_probe(struct platform_device *pdev) +{ + int err; + struct resource *r; + struct goldfish_pipe_dev *dev = pipe_dev; + + BUG_ON(sizeof(struct goldfish_pipe_command) > PAGE_SIZE); + + /* not thread safe, but this should not happen */ + WARN_ON(dev->base != NULL); + + spin_lock_init(&dev->lock); + + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (r == NULL || resource_size(r) < PAGE_SIZE) { + dev_err(&pdev->dev, "can't allocate i/o page\n"); + return -EINVAL; + } + dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE); + if (dev->base == NULL) { + dev_err(&pdev->dev, "ioremap failed\n"); + return -EINVAL; + } + + r = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (r == NULL) { + err = -EINVAL; + goto error; + } + dev->irq = r->start; + + /* + * Exchange the versions with the host device + * + * Note: v1 driver used to not report its version, so we write it before + * reading device version back: this allows the host implementation to + * detect the old driver (if there was no version write before read). 
+ */ + writel((u32)PIPE_DRIVER_VERSION, dev->base + PIPE_REG_VERSION); + dev->version = readl(dev->base + PIPE_REG_VERSION); + if (dev->version < PIPE_CURRENT_DEVICE_VERSION) { + /* initialize the old device version */ + err = goldfish_pipe_device_init_v1(pdev); + } else { + /* Host device supports the new interface */ + err = goldfish_pipe_device_init_v2(pdev); + } + if (!err) + return 0; + +error: + dev->base = NULL; + return err; +} + +static int goldfish_pipe_remove(struct platform_device *pdev) +{ + struct goldfish_pipe_dev *dev = pipe_dev; + if (dev->version < PIPE_CURRENT_DEVICE_VERSION) + goldfish_pipe_device_deinit_v1(pdev); + else + goldfish_pipe_device_deinit_v2(pdev); + dev->base = NULL; + return 0; +} + +static const struct acpi_device_id goldfish_pipe_acpi_match[] = { + { "GFSH0003", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match); + +static const struct of_device_id goldfish_pipe_of_match[] = { + { .compatible = "google,android-pipe", }, + {}, +}; +MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match); + +static struct platform_driver goldfish_pipe_driver = { + .probe = goldfish_pipe_probe, + .remove = goldfish_pipe_remove, + .driver = { + .name = "goldfish_pipe", + .of_match_table = goldfish_pipe_of_match, + .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match), + } +}; + +module_platform_driver(goldfish_pipe_driver); +MODULE_AUTHOR("David Turner "); +MODULE_LICENSE("GPL"); From f5555c059ec1ee4a7298baa373aed789d35f0133 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Thu, 23 Jul 2015 10:40:57 -0700 Subject: [PATCH 0556/3220] ANDROID: arch: x86: disable pic for Android toolchain Android toolchains enable PIC, so explicitly disable it with -fno-pic (this is the upstream gcc default) Signed-off-by: Greg Hackmann (cherry picked from commit 892606ece2bebfa5a1ed62e9552cc973707ae9d3) Change-Id: I1e600363e5d18e459479fe4eb23d76855e16868d --- arch/x86/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32..53949c886341 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -97,6 +97,8 @@ else KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) + KBUILD_CFLAGS += -fno-pic + # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) From 6297c6ba0d217d5b0998738fbfaff2f04cad77e6 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 17 Nov 2016 17:01:43 -0800 Subject: [PATCH 0557/3220] arm64: rename ranchu defconfig to ranchu64 Change-Id: Ib7cd1ef722167905957623f65c3cc064e9d5c357 --- arch/arm64/configs/{ranchu_defconfig => ranchu64_defconfig} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename arch/arm64/configs/{ranchu_defconfig => ranchu64_defconfig} (100%) diff --git a/arch/arm64/configs/ranchu_defconfig b/arch/arm64/configs/ranchu64_defconfig similarity index 100% rename from arch/arm64/configs/ranchu_defconfig rename to arch/arm64/configs/ranchu64_defconfig From 6502f5fcefadabb4ebd44583a11610751c3ff194 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 18 Nov 2016 07:26:19 +0100 Subject: [PATCH 0558/3220] ANDROID: goldfish_pipe: fix call_kern.cocci warnings Function get_free_pipe_id_locked() is called on line 671 inside a lock taken on line 669, but uses GFP_KERNEL. Replace it with GFP_ATOMIC.
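The rule behind the warning: code that holds a spinlock runs in atomic context, and a GFP_KERNEL allocation may sleep to reclaim memory, which is illegal there. A condensed sketch of the corrected pattern (illustrative, not the driver's exact code):

        unsigned long flags;
        struct goldfish_pipe **pipes;

        spin_lock_irqsave(&dev->lock, flags);
        /* GFP_ATOMIC never sleeps, so it is safe under the spinlock... */
        pipes = kcalloc(new_capacity, sizeof(*pipes), GFP_ATOMIC);
        if (!pipes) {
                /* ...but it fails more readily, so -ENOMEM must be handled. */
                spin_unlock_irqrestore(&dev->lock, flags);
                return -ENOMEM;
        }
        spin_unlock_irqrestore(&dev->lock, flags);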
Generated by: scripts/coccinelle/locks/call_kern.cocci CC: Yurii Zubrytskyi Signed-off-by: Julia Lawall Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- drivers/platform/goldfish/goldfish_pipe_v2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/goldfish/goldfish_pipe_v2.c b/drivers/platform/goldfish/goldfish_pipe_v2.c index bdb039bc7dde..ad373ed36555 100644 --- a/drivers/platform/goldfish/goldfish_pipe_v2.c +++ b/drivers/platform/goldfish/goldfish_pipe_v2.c @@ -616,7 +616,8 @@ static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev) /* Reallocate the array */ u32 new_capacity = 2 * dev->pipes_capacity; struct goldfish_pipe **pipes = - kcalloc(new_capacity, sizeof(*pipes), GFP_KERNEL); + kcalloc(new_capacity, sizeof(*pipes), + GFP_ATOMIC); if (!pipes) return -ENOMEM; memcpy(pipes, dev->pipes, sizeof(*pipes) * dev->pipes_capacity); From 5f93f3cb3ab26abd6791bc1f01e6a9db73e65536 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 18 Nov 2016 13:16:07 +0800 Subject: [PATCH 0559/3220] ANDROID: video: goldfishfb: fix platform_no_drv_owner.cocci warnings drivers/video/fbdev/goldfishfb.c:318:3-8: No need to set .owner here. The core will do it. Remove .owner field if calls are used which set it automatically Generated by: scripts/coccinelle/api/platform_no_drv_owner.cocci CC: Greg Hackmann Signed-off-by: Fengguang Wu Signed-off-by: Guenter Roeck --- drivers/video/fbdev/goldfishfb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c index 131fee0341c3..1e56b50e4082 100644 --- a/drivers/video/fbdev/goldfishfb.c +++ b/drivers/video/fbdev/goldfishfb.c @@ -322,7 +322,6 @@ static struct platform_driver goldfish_fb_driver = { .remove = goldfish_fb_remove, .driver = { .name = "goldfish_fb", - .owner = THIS_MODULE, .of_match_table = goldfish_fb_of_match, .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match), } From 37e461fc4453488c394ccdbdcb8e286c122b047a Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Fri, 18 Nov 2016 11:09:02 -0800 Subject: [PATCH 0560/3220] ANDROID: goldfish: goldfish_pipe: fix locking errors If the get_user_pages_fast() call in goldfish_pipe_read_write() failed, it would return while still holding pipe->lock. goldfish_pipe_read_write() later releases and tries to re-acquire pipe->lock. If the re-acquire call failed, goldfish_pipe_read_write() would try to unlock pipe->lock on exit anyway. This fixes the smatch messages: drivers/platform/goldfish/goldfish_pipe.c:392 goldfish_pipe_read_write() error: double unlock 'mutex:&pipe->lock' drivers/platform/goldfish/goldfish_pipe.c:397 goldfish_pipe_read_write() warn: inconsistent returns 'mutex:&pipe->lock'.
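Both fixes restore the same invariant: every exit path must leave pipe->lock in a known state, unlocking exactly once per successful lock. A minimal sketch of the convention (do_transfer() is a hypothetical helper, not the driver's code):

        if (mutex_lock_interruptible(&pipe->lock))
                return -ERESTARTSYS; /* lock not taken: exit without unlocking */

        ret = do_transfer(pipe);     /* hypothetical work done under the lock */
        mutex_unlock(&pipe->lock);   /* exactly one unlock on every later path */
        return ret;

Static checkers such as smatch flag functions whose return paths disagree about the lock state, which is how these bugs were caught.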
Change-Id: Ifd06a76b32027ca451a001704ade0c5440ed69c4 Signed-off-by: Greg Hackmann --- drivers/platform/goldfish/goldfish_pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index cf7ce97e7346..fd1452e28352 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -273,11 +273,13 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, if (ret == 0) { DPRINT("%s: error: (requested pages == 0) (wanted %d)\n", __FUNCTION__, requested_pages); + mutex_unlock(&pipe->lock); return ret; } if (ret < 0) { DPRINT("%s: (requested pages < 0) %d \n", __FUNCTION__, requested_pages); + mutex_unlock(&pipe->lock); return ret; } @@ -384,10 +386,8 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer, } /* Try to re-acquire the lock */ - if (mutex_lock_interruptible(&pipe->lock)) { - ret = -ERESTARTSYS; - break; - } + if (mutex_lock_interruptible(&pipe->lock)) + return -ERESTARTSYS; } mutex_unlock(&pipe->lock); From dc1767cd0f28e0d1ba607bb4b5e397366ae87336 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Fri, 18 Nov 2016 11:40:40 -0800 Subject: [PATCH 0561/3220] ANDROID: goldfish_pipe: fix allmodconfig build tree: https://android.googlesource.com/kernel/common android-4.4 head: 6297c6ba0d217d5b0998738fbfaff2f04cad77e6 commit: bc43565e1ac5ba3f204886a2275726bb4c3d44e6 [18/20] ANDROID: goldfish_pipe: An implementation of more parallel pipe config: i386-randconfig-s1-201646 (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: git checkout bc43565e1ac5ba3f204886a2275726bb4c3d44e6 # save the attached .config to linux build tree make ARCH=i386 All errors (new ones prefixed by >>): >> ERROR: "goldfish_pipe_device_deinit_v1" [drivers/platform/goldfish/goldfish_pipe_v2.ko] undefined! >> ERROR: "goldfish_pipe_device_init_v1" [drivers/platform/goldfish/goldfish_pipe_v2.ko] undefined! >> ERROR: "pipe_dev" [drivers/platform/goldfish/goldfish_pipe.ko] undefined! Change-Id: Ibd51441edf82e6bb6824acc05ea795570cc374e8 --- drivers/platform/goldfish/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile index e53ae2fc717b..277a820ee4e1 100644 --- a/drivers/platform/goldfish/Makefile +++ b/drivers/platform/goldfish/Makefile @@ -2,4 +2,5 @@ # Makefile for Goldfish platform specific drivers # obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o -obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o goldfish_pipe_v2.o +obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe_all.o +goldfish_pipe_all-objs := goldfish_pipe.o goldfish_pipe_v2.o From af80d7c3e91450fbcad01c497b394bf8ab01d37c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:22 -0800 Subject: [PATCH 0562/3220] UPSTREAM: timekeeping: Add a fast and NMI safe boot clock This boot clock can be used as a tracing clock and will account for suspend time. To keep it NMI safe since we're accessing from tracing, we're not using a separate timekeeper with updates to monotonic clock and boot offset protected with seqlocks. This has the following minor side effects: (1) It's possible that a timestamp is taken after the boot offset is updated but before the timekeeper is updated.
If this happens, the new boot offset is added to the old timekeeping, making the clock appear to update slightly earlier: CPU 0 CPU 1 timekeeping_inject_sleeptime64() __timekeeping_inject_sleeptime(tk, delta); timestamp(); timekeeping_update(tk, TK_CLEAR_NTP...); (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be partially updated. Since the tk->offs_boot update is a rare event, this should be a rare occurrence which postprocessing should be able to handle. Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d846324c..b7246d2ed7c9 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -233,6 +233,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); +extern u64 ktime_get_boot_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d563c1960302..7c92bad46761 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -402,6 +402,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) It's possible that a timestamp is taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping, making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; From 5fe7d45f445841c6261e8695428a532db0e0377c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:23 -0800 Subject: [PATCH 0563/3220] UPSTREAM: trace: Add an option for boot clock as trace clock Unlike the monotonic clock, the boot clock as a trace clock will account for time spent in suspend, which is useful for tracing suspend/resume. This uses the infrastructure introduced earlier for the fast boot clock.
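Once the clock is wired into the trace_clocks table (the diff below), selecting it for a trace session is a single write to the trace_clock file. A small sketch, assuming tracefs is mounted at the usual debugfs path:

        #include <fcntl.h>
        #include <unistd.h>

        /* Switch subsequent trace events to the NMI-safe boot clock. */
        static int use_boot_trace_clock(void)
        {
                int fd = open("/sys/kernel/debug/tracing/trace_clock", O_WRONLY);

                if (fd < 0)
                        return -1;
                if (write(fd, "boot", 4) < 0) {
                        close(fd);
                        return -1;
                }
                return close(fd);
        }

This is equivalent to running echo boot > trace_clock from the shell.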
Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Acked-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f375d4c05fb..312557c04c10 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -890,6 +890,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; From 7c04e2fcb076dd28760f2eca2eaf48dd37e52a40 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:24 -0800 Subject: [PATCH 0564/3220] UPSTREAM: trace: Update documentation for mono, mono_raw and boot clock Documentation was missing for mono and mono_raw; add it, along with documentation for the boot clock introduced in this series. Bug: b/33184060 Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Thomas Gleixner Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Acked-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index cf2f9e9dfe4d..fa16fb2302a5 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -357,6 +357,26 @@ of ftrace. Here is a list of some of the key files: to correlate events across hypervisor/guest if tb_offset is known. + mono: This uses the fast monotonic clock (CLOCK_MONOTONIC) + which is monotonic and is subject to NTP rate adjustments. + + mono_raw: + This is the raw monotonic clock (CLOCK_MONOTONIC_RAW) + which is monotonic but is not subject to any rate adjustments + and ticks at the same rate as the hardware clocksource. + + boot: This is the boot clock (CLOCK_BOOTTIME) and is based on the + fast monotonic clock, but also accounts for time spent in + suspend. Since the clock access is designed for use in + tracing in the suspend path, some side effects are possible + if the clock is accessed after the suspend time is accounted + for but before the fast mono clock is updated. In this case, + the clock update appears to happen slightly sooner than it + normally would have. Also on 32-bit systems, it's possible that + the 64-bit boot offset sees a partial update. These effects are + rare and post processing should be able to handle them. See the + comments on the ktime_get_boot_fast_ns() function for more + information. + To set a clock, simply echo the clock name into this file. echo global > trace_clock From 8e53f7c02371eeef39805cc998eb73ac19a1320d Mon Sep 17 00:00:00 2001 From: Ke Wang Date: Fri, 25 Nov 2016 13:38:45 +0800 Subject: [PATCH 0565/3220] sched: tune: Fix missing spinlock initialization The spinlock used by boost_groups in sched tune must be initialized.
This commit fixes this lack and the following errors: [ 0.384739] c2 BUG: spinlock bad magic on CPU#2, swapper/2/0 [ 0.390313] c2 lock: 0xffffffc15fe1fc80, .magic:00000000, .owner: /-1, .owner_cpu: 0 [ 0.398739] c2 CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.4.6+ #4 [ 0.404816] c2 Hardware name: Spreadtrum SP9860gBoard (DT) [ 0.410462] c2 Call trace: [ 0.413159] c2 [] dump_backtrace+0x0/0x210 [ 0.418803] c2 [] show_stack+0x20/0x28 [ 0.424100] c2 [] dump_stack+0xa8/0xe0 [ 0.429398] c2 [] spin_dump+0x78/0x9c [ 0.434608] c2 [] spin_bug+0x30/0x3c [ 0.439644] c2 [] do_raw_spin_lock+0xac/0x1b4 [ 0.445639] c2 [] _raw_spin_lock_irqsave+0x58/0x68 [ 0.451977] c2 [] schedtune_enqueue_task+0x84/0x3bc [ 0.458320] c2 [] enqueue_task_fair+0x438/0x208c [ 0.464487] c2 [] activate_task+0x70/0xd0 [ 0.470130] c2 [] ttwu_do_activate.constprop.131+0x4c/0x98 [ 0.477079] c2 [] try_to_wake_up+0x254/0x54c [ 0.482899] c2 [] default_wake_function+0x30/0x3c [ 0.489154] c2 [] autoremove_wake_function+0x3c/0x6c [ 0.495754] c2 [] __wake_up_common+0x64/0xa4 [ 0.501574] c2 [] __wake_up+0x48/0x60 [ 0.506788] c2 [] rcu_gp_kthread_wake+0x50/0x5c [ 0.512866] c2 [] note_gp_changes+0xac/0xd4 [ 0.518597] c2 [] rcu_process_callbacks+0xe8/0x93c [ 0.524940] c2 [] __do_softirq+0x24c/0x5b8 [ 0.530584] c2 [] irq_exit+0xc0/0xec [ 0.535623] c2 [] __handle_domain_irq+0x94/0xf8 [ 0.541789] c2 [] gic_handle_irq+0x64/0xc0 Signed-off-by: Ke Wang --- kernel/sched/tune.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 68a24a044b0a..079b18802f17 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -725,6 +725,7 @@ schedtune_init_cgroups(void) for_each_possible_cpu(cpu) { bg = &per_cpu(cpu_boost_groups, cpu); memset(bg, 0, sizeof(struct boost_groups)); + raw_spin_lock_init(&bg->lock); } pr_info("schedtune: configured to support %d boost groups\n", From 25dcb75878cd3b685a70c688758a4d3726e92703 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 12 Sep 2016 15:51:35 -0700 Subject: [PATCH 0566/3220] build: add build server configs for goldfish Change-Id: Icd7a8d44df2b09394be5c6230c64ecb374cae236 --- build.config.goldfish.arm | 12 ++++++++++++ build.config.goldfish.arm64 | 12 ++++++++++++ build.config.goldfish.mips | 11 +++++++++++ build.config.goldfish.mips64 | 11 +++++++++++ build.config.goldfish.x86 | 12 ++++++++++++ build.config.goldfish.x86_64 | 12 ++++++++++++ 6 files changed, 70 insertions(+) create mode 100644 build.config.goldfish.arm create mode 100644 build.config.goldfish.arm64 create mode 100644 build.config.goldfish.mips create mode 100644 build.config.goldfish.mips64 create mode 100644 build.config.goldfish.x86 create mode 100644 build.config.goldfish.x86_64 diff --git a/build.config.goldfish.arm b/build.config.goldfish.arm new file mode 100644 index 000000000000..bab53668e033 --- /dev/null +++ b/build.config.goldfish.arm @@ -0,0 +1,12 @@ +ARCH=arm +BRANCH=android-4.4 +CROSS_COMPILE=arm-linux-androidkernel- +DEFCONFIG=ranchu_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin +FILES=" +arch/arm/boot/zImage +vmlinux +System.map +" diff --git a/build.config.goldfish.arm64 b/build.config.goldfish.arm64 new file mode 100644 index 000000000000..0b4c40604b76 --- /dev/null +++ b/build.config.goldfish.arm64 @@ -0,0 +1,12 @@ +ARCH=arm64 +BRANCH=android-4.4 +CROSS_COMPILE=aarch64-linux-android- +DEFCONFIG=ranchu64_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish 
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin +FILES=" +arch/arm64/boot/Image +vmlinux +System.map +" diff --git a/build.config.goldfish.mips b/build.config.goldfish.mips new file mode 100644 index 000000000000..5dcd8a181ec0 --- /dev/null +++ b/build.config.goldfish.mips @@ -0,0 +1,11 @@ +ARCH=mips +BRANCH=android-4.4 +CROSS_COMPILE=mips64el-linux-android- +DEFCONFIG=ranchu_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin +FILES=" +vmlinux +System.map +" diff --git a/build.config.goldfish.mips64 b/build.config.goldfish.mips64 new file mode 100644 index 000000000000..9c0b6cbfdb9b --- /dev/null +++ b/build.config.goldfish.mips64 @@ -0,0 +1,11 @@ +ARCH=mips +BRANCH=android-4.4 +CROSS_COMPILE=mips64el-linux-android- +DEFCONFIG=ranchu64_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin +FILES=" +vmlinux +System.map +" diff --git a/build.config.goldfish.x86 b/build.config.goldfish.x86 new file mode 100644 index 000000000000..2b8a9b75a14b --- /dev/null +++ b/build.config.goldfish.x86 @@ -0,0 +1,12 @@ +ARCH=x86 +BRANCH=android-4.4 +CROSS_COMPILE=x86_64-linux-android- +DEFCONFIG=i386_ranchu_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin +FILES=" +arch/x86/boot/bzImage +vmlinux +System.map +" diff --git a/build.config.goldfish.x86_64 b/build.config.goldfish.x86_64 new file mode 100644 index 000000000000..940caefc800f --- /dev/null +++ b/build.config.goldfish.x86_64 @@ -0,0 +1,12 @@ +ARCH=x86_64 +BRANCH=android-4.4 +CROSS_COMPILE=x86_64-linux-android- +DEFCONFIG=x86_64_ranchu_defconfig +EXTRA_CMDS='' +KERNEL_DIR=goldfish +LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin +FILES=" +arch/x86/boot/bzImage +vmlinux +System.map +" From ace74ccf82cfb2b73ce1df2e698d20c2fbc559dd Mon Sep 17 00:00:00 2001 From: Keun-young Park Date: Mon, 14 Nov 2016 18:25:15 -0800 Subject: [PATCH 0567/3220] ANDROID: dm verity: add minimum prefetch size - For devices like eMMC, reading more hash blocks at a time gives better performance. - For Android, set it to a default of 128. For other devices, set it to 1, which is the same as now. - Saved 300ms of boot-up time on the tested device bug: 32246564 Cc: Sami Tolvanen Signed-off-by: Keun-young Park --- drivers/md/Kconfig | 16 ++++++++++++++++ drivers/md/dm-verity-target.c | 9 ++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 6035794bc1f2..3d237a03dab3 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -458,6 +458,21 @@ config DM_VERITY If unsure, say N. +config DM_VERITY_HASH_PREFETCH_MIN_SIZE_128 + bool "Prefetch size 128" + +config DM_VERITY_HASH_PREFETCH_MIN_SIZE + int "Verity hash prefetch minimum size" + depends on DM_VERITY + range 1 4096 + default 128 if DM_VERITY_HASH_PREFETCH_MIN_SIZE_128 + default 1 + ---help--- + This sets the minimum number of hash blocks to prefetch for dm-verity. + For devices like eMMC, a larger prefetch size such as 128 can improve + performance, at the cost of increased memory consumption for keeping + more hashes in RAM.
+ config DM_VERITY_FEC bool "Verity forward error correction support" depends on DM_VERITY @@ -510,6 +525,7 @@ config DM_ANDROID_VERITY depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE depends on MD_LINEAR + select DM_VERITY_HASH_PREFETCH_MIN_SIZE_128 ---help--- This device-mapper target is virtually a VERITY target. This target is setup by reading the metadata contents piggybacked diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 9d3d4b297201..c7e97cf6e7fb 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -501,6 +501,7 @@ static void verity_prefetch_io(struct work_struct *work) container_of(work, struct dm_verity_prefetch_work, work); struct dm_verity *v = pw->v; int i; + sector_t prefetch_size; for (i = v->levels - 2; i >= 0; i--) { sector_t hash_block_start; @@ -523,8 +524,14 @@ static void verity_prefetch_io(struct work_struct *work) hash_block_end = v->hash_blocks - 1; } no_prefetch_cluster: + /* For eMMC, it is more efficient to send one bigger read. */ + prefetch_size = max((sector_t)CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE, + hash_block_end - hash_block_start + 1); + if ((hash_block_start + prefetch_size) >= (v->hash_start + v->hash_blocks)) { + prefetch_size = hash_block_end - hash_block_start + 1; + } dm_bufio_prefetch(v->bufio, hash_block_start, - hash_block_end - hash_block_start + 1); + prefetch_size); } kfree(pw); From 4272b1a3c3f6a25f3c92f7fcbfcd6167cfbc1ced Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Wed, 7 Dec 2016 18:11:48 -0800 Subject: [PATCH 0568/3220] build: fix build config kernel_dir Change-Id: I88b87a9c85990b12dc8174349cfc14eddfb379d2 --- build.config.goldfish.arm | 2 +- build.config.goldfish.arm64 | 2 +- build.config.goldfish.mips | 2 +- build.config.goldfish.mips64 | 2 +- build.config.goldfish.x86 | 2 +- build.config.goldfish.x86_64 | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/build.config.goldfish.arm b/build.config.goldfish.arm index bab53668e033..866da9361b71 100644 --- a/build.config.goldfish.arm +++ b/build.config.goldfish.arm @@ -3,7 +3,7 @@ BRANCH=android-4.4 CROSS_COMPILE=arm-linux-androidkernel- DEFCONFIG=ranchu_defconfig EXTRA_CMDS='' -KERNEL_DIR=goldfish +KERNEL_DIR=common LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin FILES=" arch/arm/boot/zImage diff --git a/build.config.goldfish.arm64 b/build.config.goldfish.arm64 index 0b4c40604b76..9c963cf4a3d8 100644 --- a/build.config.goldfish.arm64 +++ b/build.config.goldfish.arm64 @@ -3,7 +3,7 @@ BRANCH=android-4.4 CROSS_COMPILE=aarch64-linux-android- DEFCONFIG=ranchu64_defconfig EXTRA_CMDS='' -KERNEL_DIR=goldfish +KERNEL_DIR=common LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin FILES=" arch/arm64/boot/Image diff --git a/build.config.goldfish.mips b/build.config.goldfish.mips index 5dcd8a181ec0..8af53d2c2940 100644 --- a/build.config.goldfish.mips +++ b/build.config.goldfish.mips @@ -3,7 +3,7 @@ BRANCH=android-4.4 CROSS_COMPILE=mips64el-linux-android- DEFCONFIG=ranchu_defconfig EXTRA_CMDS='' -KERNEL_DIR=goldfish +KERNEL_DIR=common LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin FILES=" vmlinux diff --git a/build.config.goldfish.mips64 b/build.config.goldfish.mips64 index 9c0b6cbfdb9b..2a33d36dc4c8 100644 --- a/build.config.goldfish.mips64 +++ b/build.config.goldfish.mips64 @@ -3,7 +3,7 @@ BRANCH=android-4.4 CROSS_COMPILE=mips64el-linux-android-
DEFCONFIG=ranchu64_defconfig
 EXTRA_CMDS=''
-KERNEL_DIR=goldfish
+KERNEL_DIR=common
 LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin
 FILES="
 vmlinux
diff --git a/build.config.goldfish.x86 b/build.config.goldfish.x86
index 2b8a9b75a14b..f86253f58d4d 100644
--- a/build.config.goldfish.x86
+++ b/build.config.goldfish.x86
@@ -3,7 +3,7 @@ BRANCH=android-4.4
 CROSS_COMPILE=x86_64-linux-android-
 DEFCONFIG=i386_ranchu_defconfig
 EXTRA_CMDS=''
-KERNEL_DIR=goldfish
+KERNEL_DIR=common
 LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
 FILES="
 arch/x86/boot/bzImage
diff --git a/build.config.goldfish.x86_64 b/build.config.goldfish.x86_64
index 940caefc800f..e1738861ec5c 100644
--- a/build.config.goldfish.x86_64
+++ b/build.config.goldfish.x86_64
@@ -3,7 +3,7 @@ BRANCH=android-4.4
 CROSS_COMPILE=x86_64-linux-android-
 DEFCONFIG=x86_64_ranchu_defconfig
 EXTRA_CMDS=''
-KERNEL_DIR=goldfish
+KERNEL_DIR=common
 LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
 FILES="
 arch/x86/boot/bzImage

From 3a29814dae2302eda661514805d6fba32a8e3ba0 Mon Sep 17 00:00:00 2001
From: Ke Wang
Date: Thu, 8 Dec 2016 14:02:10 +0800
Subject: [PATCH 0569/3220] sched: fix wrong truncation of walt_avg

The result of "__entry->walt_avg = (__entry->demand << 10)" will exceed
the range of "unsigned int", so the value is truncated, which makes the
trace look like the following (for example, demand 9928307 shifted left
by 10 is 10166586368, well above the 32-bit maximum of 4294967295, so
the stored value wraps to 1576651776):

UnityMain-4588 [004] 6029.645672: walt_update_history: 4588(UnityMain): runtime 9928307 samples 1 event 4 demand 9928307 walt 157 pelt 870 (hist: 9928307 9604307 8440077 87392 34144328) cpu 4
UnityMain-4588 [004] 6029.653658: walt_update_history: 4588(UnityMain): runtime 10000000 samples 1 event 4 demand 10000000 walt 165 pelt 886 (hist: 10000000 9955691 6549308 64000 34144328) cpu 4

Fix this by using a u64 type instead of an unsigned int type, which
makes the trace look like the following:

UnityMain-4617 [004] 117.613558: walt_update_history: 4617(UnityMain): runtime 5770597 samples 1 event 4 demand 7038739 walt 720 pelt 680 (hist: 5770597 7680001 8904509 65596 156) cpu 4
UnityMain-4617 [004] 117.633560: walt_update_history: 4617(UnityMain): runtime 9911238 samples 1 event 4 demand 9911238 walt 1014 pelt 769 (hist: 9911238 5770597 7680001 0 1664188058) cpu 4

Signed-off-by: Ke Wang
---
 include/trace/events/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dffaffab4bc8..c18d8c89bd12 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -1044,7 +1044,7 @@ TRACE_EVENT(walt_update_history,
		__field(	int,	samples			)
		__field(	int,	evt			)
		__field(	u64,	demand			)
-		__field(unsigned int,	walt_avg		)
+		__field(	u64,	walt_avg		)
		__field(unsigned int,	pelt_avg		)
		__array(	u32,	hist, RAVG_HIST_SIZE_MAX)
		__field(	int,	cpu			)
@@ -1066,7 +1066,7 @@ TRACE_EVENT(walt_update_history,
	),

	TP_printk("%d (%s): runtime %u samples %d event %d demand %llu"
-		" walt %u pelt %u (hist: %u %u %u %u %u) cpu %d",
+		" walt %llu pelt %u (hist: %u %u %u %u %u) cpu %d",
		__entry->pid, __entry->comm,
		__entry->runtime, __entry->samples, __entry->evt,
		__entry->demand,

From e487a24793bbf6d1ff2ba1c20575a9adabc13698 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Tue, 6 Dec 2016 11:50:53 +0000
Subject: [PATCH 0570/3220] sched/walt: kill {min,max}_capacity

{min,max}_capacity are static variables that are only updated from
__update_min_max_capacity(), but not used anywhere else.
Remove them together with the function updating them. This also has the
nice side effect of fixing a LOCKDEP warning related to locking all CPUs
in update_min_max_capacity(), as reported by Ke Wang:

[ 2.853595] c0 =============================================
[ 2.859219] c0 [ INFO: possible recursive locking detected ]
[ 2.864852] c0 4.4.6+ #5 Tainted: G W
[ 2.869604] c0 ---------------------------------------------
[ 2.875230] c0 swapper/0/1 is trying to acquire lock:
[ 2.880248] (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c
[ 2.888815] c0
[ 2.888815] c0 but task is already holding lock:
[ 2.895132] (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c
[ 2.903700] c0
[ 2.903700] c0 other info that might help us debug this:
[ 2.910710] c0 Possible unsafe locking scenario:
[ 2.910710] c0
[ 2.917112] c0 CPU0
[ 2.919795] c0 ----
[ 2.922478] lock(&rq->lock);
[ 2.925507] lock(&rq->lock);
[ 2.928536] c0
[ 2.928536] c0 *** DEADLOCK ***
[ 2.928536] c0
[ 2.935200] c0 May be due to missing lock nesting notation
[ 2.935200] c0
[ 2.942471] c0 7 locks held by swapper/0/1:
[ 2.946623] #0: (&dev->mutex){......}, at: [] __driver_attach+0x64/0xb8
[ 2.954931] #1: (&dev->mutex){......}, at: [] __driver_attach+0x74/0xb8
[ 2.963239] #2: (cpu_hotplug.lock){++++++}, at: [] get_online_cpus+0x48/0xa8
[ 2.971979] #3: (subsys mutex#6){+.+.+.}, at: [] subsys_interface_register+0x44/0xc0
[ 2.981411] #4: (&policy->rwsem){+.+.+.}, at: [] cpufreq_online+0x330/0x76c
[ 2.990065] #5: ((cpufreq_policy_notifier_list).rwsem){.+.+..}, at: [] blocking_notifier_call_chain+0x38/0xc4
[ 3.001661] #6: (&rq->lock){-.-.-.}, at: [] cpufreq_notifier_policy+0x2e8/0x37c
[ 3.010661] c0
[ 3.010661] c0 stack backtrace:
[ 3.015514] c0 CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.4.6+ #5
[ 3.022864] c0 Hardware name: Spreadtrum SP9860g Board (DT)
[ 3.028402] c0 Call trace:
[ 3.031092] c0 [] dump_backtrace+0x0/0x210
[ 3.036716] c0 [] show_stack+0x20/0x28
[ 3.041994] c0 [] dump_stack+0xa8/0xe0
[ 3.047273] c0 [] __lock_acquire+0x1e0c/0x2218
[ 3.053243] c0 [] lock_acquire+0xe0/0x280
[ 3.058784] c0 [] _raw_spin_lock+0x44/0x58
[ 3.064407] c0 [] cpufreq_notifier_policy+0x2e8/0x37c
[ 3.070983] c0 [] blocking_notifier_call_chain+0x78/0xc4
[ 3.077820] c0 [] cpufreq_online+0x28c/0x76c
[ 3.083618] c0 [] cpufreq_add_dev+0x98/0xdc
[ 3.089331] c0 [] subsys_interface_register+0x84/0xc0
[ 3.095907] c0 [] cpufreq_register_driver+0x168/0x28c
[ 3.102486] c0 [] sprd_cpufreq_probe+0x134/0x19c
[ 3.108629] c0 [] platform_drv_probe+0x58/0xd0
[ 3.114599] c0 [] driver_probe_device+0x1e8/0x470
[ 3.120830] c0 [] __driver_attach+0xb4/0xb8
[ 3.126541] c0 [] bus_for_each_dev+0x6c/0xac
[ 3.132339] c0 [] driver_attach+0x2c/0x34
[ 3.137877] c0 [] bus_add_driver+0x210/0x298
[ 3.143676] c0 [] driver_register+0x7c/0x114
[ 3.149476] c0 [] __platform_driver_register+0x60/0x6c
[ 3.156139] c0 [] sprd_cpufreq_platdrv_init+0x18/0x20
[ 3.162714] c0 [] do_one_initcall+0xd0/0x1d8
[ 3.168514] c0 [] kernel_init_freeable+0x1fc/0x29c
[ 3.174834] c0 [] kernel_init+0x20/0x12c
[ 3.180281] c0 [] ret_from_fork+0x10/0x40

Reported-by: Ke Wang
Signed-off-by: Juri Lelli
---
 kernel/sched/walt.c | 45 +--------------------------------------------
 1 file changed, 1 insertion(+), 44 deletions(-)

diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 2ffb1680b380..6e053bd9830c 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -62,8 +62,6 @@ static unsigned int max_possible_freq = 1;
 */
 static unsigned int min_max_freq = 1;

-static unsigned int max_capacity = 1024;
-static unsigned int min_capacity = 1024; static unsigned int max_load_scale_factor = 1024; static unsigned int max_possible_capacity = 1024; @@ -869,39 +867,6 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) double_rq_unlock(src_rq, dest_rq); } -/* Keep track of max/min capacity possible across CPUs "currently" */ -static void __update_min_max_capacity(void) -{ - int i; - int max = 0, min = INT_MAX; - - for_each_online_cpu(i) { - if (cpu_rq(i)->capacity > max) - max = cpu_rq(i)->capacity; - if (cpu_rq(i)->capacity < min) - min = cpu_rq(i)->capacity; - } - - max_capacity = max; - min_capacity = min; -} - -static void update_min_max_capacity(void) -{ - unsigned long flags; - int i; - - local_irq_save(flags); - for_each_possible_cpu(i) - raw_spin_lock(&cpu_rq(i)->lock); - - __update_min_max_capacity(); - - for_each_possible_cpu(i) - raw_spin_unlock(&cpu_rq(i)->lock); - local_irq_restore(flags); -} - /* * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that * least efficient cpu gets capacity of 1024 @@ -984,15 +949,9 @@ static int cpufreq_notifier_policy(struct notifier_block *nb, /* Initialized to policy->max in case policy->related_cpus is empty! */ unsigned int orig_max_freq = policy->max; - if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY && - val != CPUFREQ_CREATE_POLICY) + if (val != CPUFREQ_NOTIFY) return 0; - if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) { - update_min_max_capacity(); - return 0; - } - for_each_cpu(i, policy->related_cpus) { cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, policy->related_cpus); @@ -1082,8 +1041,6 @@ static int cpufreq_notifier_policy(struct notifier_block *nb, max_load_scale_factor = highest_mplsf; } - __update_min_max_capacity(); - return 0; } From 3313d27976a9f1fb3aa96487fdf11724a7f08cb3 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 8 Dec 2016 17:06:03 -0800 Subject: [PATCH 0571/3220] goldfish: enable CONFIG_INET_DIAG_DESTROY Bug: 31648368 Change-Id: I3715cc6474129ba2176be62ed2c0a7d09a6f2ac7 --- arch/arm/configs/ranchu_defconfig | 1 + arch/arm64/configs/ranchu64_defconfig | 1 + arch/x86/configs/i386_ranchu_defconfig | 2 +- arch/x86/configs/x86_64_ranchu_defconfig | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/configs/ranchu_defconfig b/arch/arm/configs/ranchu_defconfig index 35a90af941a4..49e7bbd5825a 100644 --- a/arch/arm/configs/ranchu_defconfig +++ b/arch/arm/configs/ranchu_defconfig @@ -48,6 +48,7 @@ CONFIG_UNIX=y CONFIG_XFRM_USER=y CONFIG_NET_KEY=y CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y diff --git a/arch/arm64/configs/ranchu64_defconfig b/arch/arm64/configs/ranchu64_defconfig index 00eb346e0928..fc55008d8c4c 100644 --- a/arch/arm64/configs/ranchu64_defconfig +++ b/arch/arm64/configs/ranchu64_defconfig @@ -50,6 +50,7 @@ CONFIG_UNIX=y CONFIG_XFRM_USER=y CONFIG_NET_KEY=y CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig index 0206eb8cfb61..65ed8c8f8444 100644 --- a/arch/x86/configs/i386_ranchu_defconfig +++ b/arch/x86/configs/i386_ranchu_defconfig @@ -89,7 +89,7 @@ CONFIG_SYN_COOKIES=y CONFIG_INET_ESP=y # CONFIG_INET_XFRM_MODE_BEET is not set # CONFIG_INET_LRO is not set -# CONFIG_INET_DIAG is not set +CONFIG_INET_DIAG_DESTROY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IPV6_OPTIMISTIC_DAD=y 
diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig index dd389774bacb..d977bd91e390 100644 --- a/arch/x86/configs/x86_64_ranchu_defconfig +++ b/arch/x86/configs/x86_64_ranchu_defconfig @@ -87,7 +87,7 @@ CONFIG_SYN_COOKIES=y CONFIG_INET_ESP=y # CONFIG_INET_XFRM_MODE_BEET is not set # CONFIG_INET_LRO is not set -# CONFIG_INET_DIAG is not set +CONFIG_INET_DIAG_DESTROY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IPV6_OPTIMISTIC_DAD=y From fde8582a59f5b6968d81d770d1fb22d3487392a3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 3 Mar 2016 15:10:59 +0100 Subject: [PATCH 0572/3220] UPSTREAM: arm64: enable CONFIG_DEBUG_RODATA by default (Cherry picked from commit 57efac2f7108e3255d0dfe512290c9896f4ed55f) In spite of its name, CONFIG_DEBUG_RODATA is an important hardening feature for production kernels, and distros all enable it by default in their kernel configs. However, since enabling it used to result in more granular, and thus less efficient kernel mappings, it is not enabled by default for performance reasons. However, since commit 2f39b5f91eb4 ("arm64: mm: Mark .rodata as RO"), the various kernel segments (.text, .rodata, .init and .data) are already mapped individually, and the only effect of setting CONFIG_DEBUG_RODATA is that the existing .text and .rodata mappings are updated late in the boot sequence to have their read-only attributes set, which means that any performance concerns related to enabling CONFIG_DEBUG_RODATA are no longer valid. So from now on, make CONFIG_DEBUG_RODATA default to 'y' Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Acked-by: Kees Cook Signed-off-by: Catalin Marinas Signed-off-by: Amit Pundir --- arch/arm64/Kconfig.debug | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 04fb73b973f1..8b0cd45de394 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -64,13 +64,13 @@ config DEBUG_SET_MODULE_RONX config DEBUG_RODATA bool "Make kernel text and rodata read-only" + default y help If this is set, kernel text and rodata will be made read-only. This is to help catch accidental or malicious attempts to change the - kernel's executable code. Additionally splits rodata from kernel - text so it can be made explicitly non-executable. + kernel's executable code. - If in doubt, say Y + If in doubt, say Y config DEBUG_ALIGN_RODATA depends on DEBUG_RODATA && ARM64_4K_PAGES From b571c4f0cf7e6dbcb2a51fbceefcf27518003b47 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 May 2016 10:43:59 +0100 Subject: [PATCH 0573/3220] UPSTREAM: arm64: Fix typo in the pmdp_huge_get_and_clear() definition (Cherry picked from commit 911f56eeb87ee378f5e215469268a7a2f68a5a8a) With hardware AF/DBM support, pmd modifications (transparent huge pages) should be performed atomically using load/store exclusive. The initial patches defined the get-and-clear function and __HAVE_ARCH_* macro without the "huge" word, leaving the pmdp_huge_get_and_clear() to the default, non-atomic implementation. 
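For context, a sketch of the generic fallback that ends up being used when an
architecture does not define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR (this mirrors
include/asm-generic/pgtable.h around v4.4 and is illustrative, not part of this
patch):

#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
					    unsigned long address,
					    pmd_t *pmdp)
{
	/* Plain load followed by a plain store: a hardware AF/DBM
	 * update that lands between the two steps is silently lost.
	 */
	pmd_t pmd = *pmdp;

	pmd_clear(pmdp);
	return pmd;
}
#endif

With the macro spelled correctly, the arm64 load/store-exclusive implementation
in the hunk below is selected instead, making the read-and-clear atomic with
respect to concurrent hardware updates.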
Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Cc: # 4.3+ Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Amit Pundir --- arch/arm64/include/asm/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 40017aa2fcbd..b420eee6026a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -652,9 +652,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, pmd_t *pmdp) +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) { return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp)); } From f82be531155c6ce5890df4ffe6fad9b1fa2eafac Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 5 May 2016 10:44:00 +0100 Subject: [PATCH 0574/3220] UPSTREAM: arm64: Implement pmdp_set_access_flags() for hardware AF/DBM (Cherry picked from commit 282aa7051b0169991b34716f0f22d9c2f59c46c4) The update to the accessed or dirty states for block mappings must be done atomically on hardware with support for automatic AF/DBM. The ptep_set_access_flags() function has been fixed as part of commit 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM"). This patch brings pmdp_set_access_flags() in line with the pte counterpart. Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Cc: # 4.4.x: 66dbd6e61a52: arm64: Implement ptep_set_access_flags() for hardware AF/DBM Cc: # 4.3+ Reviewed-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Amit Pundir --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b420eee6026a..22dbef8a677b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -600,6 +600,16 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +static inline int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); +} +#endif + /* * Atomic pte/pmd modifications. */ From 18f41ad6976a772f283b6563963957b85d833e5b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 7 Jun 2016 17:55:15 +0100 Subject: [PATCH 0575/3220] UPSTREAM: arm64: mm: always take dirty state from new pte in ptep_set_access_flags (Cherry picked from commit 0106d456c4cb1770253fefc0ab23c9ca760b43f7) Commit 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM") ensured that pte flags are updated atomically in the face of potential concurrent, hardware-assisted updates. However, Alex reports that: | This patch breaks swapping for me. | In the broken case, you'll see either systemd cpu time spike (because | it's stuck in a page fault loop) or the system hang (because the | application owning the screen is stuck in a page fault loop). 
It turns out that this is because the 'dirty' argument to ptep_set_access_flags is always 0 for read faults, and so we can't use it to set PTE_RDONLY. The failing sequence is: 1. We put down a PTE_WRITE | PTE_DIRTY | PTE_AF pte 2. Memory pressure -> pte_mkold(pte) -> clear PTE_AF 3. A read faults due to the missing access flag 4. ptep_set_access_flags is called with dirty = 0, due to the read fault 5. pte is then made PTE_WRITE | PTE_DIRTY | PTE_AF | PTE_RDONLY (!) 6. A write faults, but pte_write is true so we get stuck The solution is to check the new page table entry (as would be done by the generic, non-atomic definition of ptep_set_access_flags that just calls set_pte_at) to establish the dirty state. Cc: # 4.3+ Fixes: 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM") Reviewed-by: Catalin Marinas Reported-by: Alexander Graf Tested-by: Alexander Graf Signed-off-by: Will Deacon Fixes: Change-Id: Id2a0b0d8eb6e7df6325ecb48b88b8401a5dd09e5 ("UPSTREAM: arm64: Implement ptep_set_access_flags() for hardware AF/DBM") Signed-off-by: Amit Pundir --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index f0c75fd6a3fa..9cedb10b1107 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -109,7 +109,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, * PTE_RDONLY is cleared by default in the asm below, so set it in * back if necessary (read-only or clean PTE). */ - if (!pte_write(entry) || !dirty) + if (!pte_write(entry) || !pte_sw_dirty(entry)) pte_val(entry) |= PTE_RDONLY; /* From 61f26de8c058833bb29aa4641717fc5b873724b5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 30 Mar 2016 09:46:23 +0200 Subject: [PATCH 0576/3220] UPSTREAM: efi/arm64: Don't apply MEMBLOCK_NOMAP to UEFI memory map mapping (Cherry picked from commit 7cc8cbcf82d165dd658d89a7a287140948e76413) Commit 4dffbfc48d65 ("arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP") updated the mapping logic of both the RuntimeServices regions as well as the kernel's copy of the UEFI memory map to set the MEMBLOCK_NOMAP flag, which causes these regions to be omitted from the kernel direct mapping, and from being covered by a struct page. For the RuntimeServices regions, this is an obvious win, since the contents of these regions have significance to the firmware executable code itself, and are mapped in the EFI page tables using attributes that are described in the UEFI memory map, and which may differ from the attributes we use for mapping system RAM. It also prevents the contents from being modified inadvertently, since the EFI page tables are only live during runtime service invocations. None of these concerns apply to the allocation that covers the UEFI memory map, since it is entirely owned by the kernel. Setting the MEMBLOCK_NOMAP on the region did allow us to use ioremap_cache() to map it both on arm64 and on ARM, since the latter does not allow ioremap_cache() to be used on regions that are covered by a struct page. The ioremap_cache() on ARM restriction will be lifted in the v4.7 timeframe, but in the mean time, it has been reported that commit 4dffbfc48d65 causes a regression on 64k granule kernels. 
This is due to the fact that, given the 64 KB page size, the region that we end up removing from the kernel direct mapping is rounded up to 64 KB, and this 64 KB page frame may be shared with the initrd when booting via GRUB (which does not align its EFI_LOADER_DATA allocations to 64 KB like the stub does). This will crash the kernel as soon as it tries to access the initrd. Since the issue is specific to arm64, revert back to memblock_reserve()'ing the UEFI memory map when running on arm64. This is a temporary fix for v4.5 and v4.6, and will be superseded in the v4.7 timeframe when we will be able to move back to memblock_reserve() unconditionally. Fixes: 4dffbfc48d65 ("arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP") Reported-by: Mark Salter Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Cc: Leif Lindholm Cc: Mark Rutland Cc: Jeremy Linton Cc: Mark Langsdorf Cc: # v4.5 Signed-off-by: Matt Fleming Fixes: Change-Id: Ia3ce78f40f8d41a9afdd42238fe9cbfd81bbff08 ("UPSTREAM: arm64/efi: mark UEFI reserved regions as MEMBLOCK_NOMAP") Signed-off-by: Amit Pundir --- drivers/firmware/efi/arm-init.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 9e15d571b53c..a76c35fc0b92 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -203,7 +203,19 @@ void __init efi_init(void) reserve_regions(); early_memunmap(memmap.map, params.mmap_size); - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); + + if (IS_ENABLED(CONFIG_ARM)) { + /* + * ARM currently does not allow ioremap_cache() to be called on + * memory regions that are covered by struct page. So remove the + * UEFI memory map from the linear mapping. + */ + memblock_mark_nomap(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); + } else { + memblock_reserve(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); + } } From 9789b697c6e4db733b4afe9572cd8e053c63e943 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 12 Dec 2016 11:41:11 +0900 Subject: [PATCH 0577/3220] Revert "net: core: Support UID-based routing." This reverts commit fd2cf795f3ab193752781be7372949ac1780d0ed. 
Bug: 16355602 Change-Id: I1ec2d1eb3d53f4186b60c6ca5d6a20fcca46d442 --- include/net/fib_rules.h | 4 --- include/net/flow.h | 9 +----- include/net/ip.h | 1 - include/net/ip6_route.h | 2 +- include/net/route.h | 6 ++-- include/uapi/linux/fib_rules.h | 2 -- include/uapi/linux/rtnetlink.h | 1 - net/core/fib_rules.c | 53 ++------------------------------ net/ipv4/fib_frontend.c | 1 - net/ipv4/inet_connection_sock.c | 4 +-- net/ipv4/ip_output.c | 3 +- net/ipv4/ping.c | 3 +- net/ipv4/raw.c | 3 +- net/ipv4/route.c | 25 ++++----------- net/ipv4/syncookies.c | 5 ++- net/ipv4/udp.c | 3 +- net/ipv6/af_inet6.c | 1 - net/ipv6/ah6.c | 2 +- net/ipv6/datagram.c | 1 - net/ipv6/esp6.c | 2 +- net/ipv6/icmp.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 -- net/ipv6/ip6_vti.c | 2 +- net/ipv6/ipcomp6.c | 2 +- net/ipv6/ping.c | 1 - net/ipv6/raw.c | 1 - net/ipv6/route.c | 11 ++----- net/ipv6/syncookies.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/ipv6/udp.c | 1 - 30 files changed, 27 insertions(+), 128 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index bdd985f41022..59160de702b6 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -29,8 +29,6 @@ struct fib_rule { int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; - kuid_t uid_start; - kuid_t uid_end; struct rcu_head rcu; }; @@ -89,8 +87,6 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_UID_START] = { .type = NLA_U32 }, \ - [FRA_UID_END] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 } diff --git a/include/net/flow.h b/include/net/flow.h index 833080732dec..83969eebebf3 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -11,7 +11,6 @@ #include #include #include -#include /* * ifindex generation is per-net namespace, and loopback is @@ -39,7 +38,6 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x08 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; - kuid_t flowic_uid; }; union flowi_uli { @@ -77,7 +75,6 @@ struct flowi4 { #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key -#define flowi4_uid __fl_common.flowic_uid /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -97,8 +94,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, - __be16 dport, __be16 sport, - kuid_t uid) + __be16 dport, __be16 sport) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; @@ -109,7 +105,6 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; - fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; @@ -138,7 +133,6 @@ struct flowi6 { #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key -#define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; @@ -183,7 +177,6 @@ struct flowi { #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key -#define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); 
static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/net/ip.h b/include/net/ip.h index 4f3ef345f4c2..1a98f1ca1638 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -170,7 +170,6 @@ struct ip_reply_arg { /* -1 if not needed */ int bound_dev_if; u8 tos; - kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 4bbd221637cd..877f682989b8 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -108,7 +108,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, - u32 mark, kuid_t uid); + u32 mark); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, diff --git a/include/net/route.h b/include/net/route.h index d016a8cb45cf..a3b9ef74a389 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -154,8 +154,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, - daddr, saddr, dport, sport, - sk ? sock_i_uid(sk) : GLOBAL_ROOT_UID); + daddr, saddr, dport, sport); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -268,8 +267,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - protocol, flow_flags, dst, src, dport, sport, - sock_i_uid(sk)); + protocol, flow_flags, dst, src, dport, sport); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index ce19c5bf51f7..96161b8202b5 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -49,8 +49,6 @@ enum { FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, - FRA_UID_START, /* UID range */ - FRA_UID_END, __FRA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 3eb02a1d6d8c..123a5af4e8bb 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -306,7 +306,6 @@ enum rtattr_type_t { RTA_TABLE, RTA_MARK, RTA_MFC_STATS, - RTA_UID, RTA_VIA, RTA_NEWDST, RTA_PREF, diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3fbd839f6d20..365de66436ac 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,8 +33,6 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->table = table; r->flags = flags; r->fr_net = ops->fro_net; - r->uid_start = INVALID_UID; - r->uid_end = INVALID_UID; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -174,23 +172,6 @@ void fib_rules_unregister(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_unregister); -static inline kuid_t fib_nl_uid(struct nlattr *nla) -{ - return make_kuid(current_user_ns(), nla_get_u32(nla)); -} - -static int nla_put_uid(struct sk_buff *skb, int idx, kuid_t uid) -{ - return nla_put_u32(skb, idx, from_kuid_munged(current_user_ns(), uid)); -} - -static int fib_uid_range_match(struct flowi *fl, struct fib_rule *rule) -{ - return (!uid_valid(rule->uid_start) && !uid_valid(rule->uid_end)) || - (uid_gte(fl->flowi_uid, 
rule->uid_start) && - uid_lte(fl->flowi_uid, rule->uid_end)); -} - static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags) { @@ -208,9 +189,6 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; - if (!fib_uid_range_match(fl, rule)) - goto out; - ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -393,19 +371,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } else if (rule->action == FR_ACT_GOTO) goto errout_free; - /* UID start and end must either both be valid or both unspecified. */ - rule->uid_start = rule->uid_end = INVALID_UID; - if (tb[FRA_UID_START] || tb[FRA_UID_END]) { - if (tb[FRA_UID_START] && tb[FRA_UID_END]) { - rule->uid_start = fib_nl_uid(tb[FRA_UID_START]); - rule->uid_end = fib_nl_uid(tb[FRA_UID_END]); - } - if (!uid_valid(rule->uid_start) || - !uid_valid(rule->uid_end) || - !uid_lte(rule->uid_start, rule->uid_end)) - goto errout_free; - } - err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; @@ -518,14 +483,6 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) continue; - if (tb[FRA_UID_START] && - !uid_eq(rule->uid_start, fib_nl_uid(tb[FRA_UID_START]))) - continue; - - if (tb[FRA_UID_END] && - !uid_eq(rule->uid_end, fib_nl_uid(tb[FRA_UID_END]))) - continue; - if (!ops->compare(rule, frh, tb)) continue; @@ -592,9 +549,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ - + nla_total_size(8) /* FRA_TUN_ID */ - + nla_total_size(4) /* FRA_UID_START */ - + nla_total_size(4); /* FRA_UID_END */ + + nla_total_size(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -652,11 +607,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)) || - (uid_valid(rule->uid_start) && - nla_put_uid(skb, FRA_UID_START, rule->uid_start)) || - (uid_valid(rule->uid_end) && - nla_put_uid(skb, FRA_UID_END, rule->uid_end))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e10edb5e78b0..473447593060 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -629,7 +629,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, - [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 728414dcea3b..46b9c887bede 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -420,7 +420,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sock_i_uid((struct sock *)sk)); + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -457,7 +457,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sock_i_uid((struct sock *)sk)); + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 33bef2763c72..4233cbe47052 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1573,8 +1573,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, - tcp_hdr(skb)->source, tcp_hdr(skb)->dest, - arg->uid); + tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b27e98010dea..e89094ab5ddb 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -789,8 +789,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, - sock_i_uid(sk)); + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 865895d3fb27..bc35f1842512 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -599,8 +599,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), - daddr, saddr, 0, 0, - sock_i_uid(sk)); + daddr, saddr, 0, 0); if (!saddr && ipc.oif) { err = l3mdev_get_saddr(net, ipc.oif, &fl4); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a0d842f4e9cf..85f184e429c6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -500,7 +500,7 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void __build_flow_key(struct flowi4 *fl4, struct sock *sk, +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, u8 prot, u32 mark, int flow_flags) @@ -516,12 +516,11 @@ static void __build_flow_key(struct flowi4 *fl4, struct sock *sk, flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot, flow_flags, - iph->daddr, iph->saddr, 0, 0, - sk ? 
sock_i_uid(sk) : GLOBAL_ROOT_UID); + iph->daddr, iph->saddr, 0, 0); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, - struct sock *sk) + const struct sock *sk) { const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; @@ -532,7 +531,7 @@ static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); } -static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk) +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; @@ -546,12 +545,11 @@ static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), - daddr, inet->inet_saddr, 0, 0, - sock_i_uid(sk)); + daddr, inet->inet_saddr, 0, 0); rcu_read_unlock(); } -static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk, +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct sk_buff *skb) { if (skb) @@ -2424,11 +2422,6 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; - if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && - nla_put_u32(skb, RTA_UID, - from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) - goto nla_put_failure; - error = rt->dst.error; if (rt_is_input_route(rt)) { @@ -2480,7 +2473,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int mark; struct sk_buff *skb; u32 table_id = RT_TABLE_MAIN; - kuid_t uid; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2508,10 +2500,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; - if (tb[RTA_UID]) - uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); - else - uid = (iif ? INVALID_UID : current_uid()); memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; @@ -2519,7 +2507,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_tos = rtm->rtm_tos; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; - fl4.flowi4_uid = uid; if (netif_index_is_l3_master(net, fl4.flowi4_oif)) fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 31b6a4c9db32..4cbe9f0a4281 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -374,9 +374,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest, - sock_i_uid(sk)); + opt->srr ? 
opt->faddr : ireq->ir_rmt_addr, + ireq->ir_loc_addr, th->source, th->dest); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 660933edd2d2..f8b3842b9070 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1023,8 +1023,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, - faddr, saddr, dport, inet->inet_sport, - sock_i_uid(sk)); + faddr, saddr, dport, inet->inet_sport); if (!saddr && ipc.oif) { err = l3mdev_get_saddr(net, ipc.oif, fl4); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d9b25bd17bf1..9dbfacb6e0d9 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -678,7 +678,6 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sock_i_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); rcu_read_lock(); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index c52b8fc904c9..0630a4d5daaa 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -664,7 +664,7 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); return 0; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 0743a5f4c533..517c55b01ba8 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -161,7 +161,6 @@ ipv4_connected: fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sock_i_uid(sk); if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) fl6.flowi6_oif = np->mcast_oif; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index f921368c32c9..060a60b2f8a6 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -476,7 +476,7 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); return 0; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 41e5c9520c7d..3697cd08c515 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -92,7 +92,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); if (type == ICMPV6_PKT_TOOBIG) - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); else if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 897bb6eb5751..a7ca2cde2ecb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -86,7 +86,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); - fl6->flowi6_uid = sock_i_uid((struct sock *)sk); security_req_classify_flow(req, flowi6_to_flowi(fl6)); dst = ip6_dst_lookup_flow(sk, fl6, final_p); @@ -135,7 +134,6 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; - fl6->flowi6_uid = 
sock_i_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); rcu_read_lock(); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index c76ebc7fc52d..0a8610b33d79 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -599,7 +599,7 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); return 0; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index b247baceb797..1b9316e1386a 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -76,7 +76,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0); else - ip6_update_pmtu(skb, net, info, 0, 0, INVALID_UID); + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); return 0; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 9411c8d770a5..a5cf82ccf406 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -141,7 +141,6 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.daddr = *daddr; fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; - fl6.flowi6_uid = sock_i_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index d9ad71a01b4c..99140986e887 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -768,7 +768,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = sk->sk_mark; - fl6.flowi6_uid = sock_i_uid(sk); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 02ba70201e05..60359bea6a16 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1383,7 +1383,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, - int oif, u32 mark, kuid_t uid) + int oif, u32 mark) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; @@ -1395,7 +1395,6 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); - fl6.flowi6_uid = uid; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) @@ -1407,7 +1406,7 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { ip6_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark, sock_i_uid(sk)); + sk->sk_bound_dev_if, sk->sk_mark); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); @@ -2686,7 +2685,6 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, - [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3248,11 +3246,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); - if (tb[RTA_UID]) - fl6.flowi6_uid = make_kuid(current_user_ns(), - nla_get_u32(tb[RTA_UID])); - else - fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); if (iif) { struct net_device *dev; int flags = 0; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index a22015fab95e..eaf7ac496d50 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -228,7 +228,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; - fl6.flowi6_uid = sock_i_uid(sk); security_req_classify_flow(req, flowi6_to_flowi(&fl6)); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e6b044480333..f6eb65a5d343 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -234,7 +234,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sock_i_uid(sk); opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); final_p = fl6_update_dst(&fl6, opt, &final); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 54235ef177bb..fcce7b4ba8ff 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1243,7 +1243,6 @@ do_udp_sendmsg: fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; fl6.flowi6_mark = sk->sk_mark; - fl6.flowi6_uid = sock_i_uid(sk); if (msg->msg_controllen) { opt = &opt_space; From eb964bdba79aee0f244efef0730d9d022ccc9ac8 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 4 Nov 2016 02:23:41 +0900 Subject: [PATCH 0578/3220] net: core: Add a UID field to struct sock. Protocol sockets (struct sock) don't have UIDs, but most of the time, they map 1:1 to userspace sockets (struct socket) which do. Various operations such as the iptables xt_owner match need access to the "UID of a socket", and do so by following the backpointer to the struct socket. This involves taking sk_callback_lock and doesn't work when there is no socket because userspace has already called close(). Simplify this by adding a sk_uid field to struct sock whose value matches the UID of the corresponding struct socket. The semantics are as follows: 1. Whenever sk_socket is non-null: sk_uid is the same as the UID in sk_socket, i.e., matches the return value of sock_i_uid. Specifically, the UID is set when userspace calls socket(), fchown(), or accept(). 2. When sk_socket is NULL, sk_uid is defined as follows: - For a socket that no longer has a sk_socket because userspace has called close(): the previous UID. - For a cloned socket (e.g., an incoming connection that is established but on which userspace has not yet called accept): the UID of the socket it was cloned from. - For a socket that has never had an sk_socket: UID 0 inside the user namespace corresponding to the network namespace the socket belongs to. Kernel sockets created by sock_create_kern are a special case of #1 and sk_uid is the user that created them. For kernel sockets created at network namespace creation time, such as the per-processor ICMP and TCP sockets, this is the user that created the network namespace. Bug: 16355602 Change-Id: Idbc3e9a0cec91c4c6e01916b967b6237645ebe59 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller
---
 include/net/sock.h |  7 +++++++
 net/core/sock.c    |  5 ++++-
 net/socket.c       | 14 ++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 2d663ee8494d..5d8e0049b71c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -446,6 +446,7 @@ struct sock {
	void		*sk_security;
 #endif
	__u32		sk_mark;
+	kuid_t		sk_uid;
 #ifdef CONFIG_CGROUP_NET_CLASSID
	u32		sk_classid;
 #endif
@@ -1682,6 +1683,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
	sk->sk_wq = parent->wq;
	parent->sk = sk;
	sk_set_socket(sk, parent);
+	sk->sk_uid = SOCK_INODE(parent)->i_uid;
	security_sock_graft(sk, parent);
	write_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1689,6 +1691,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 kuid_t sock_i_uid(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);

+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+	return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
 static inline u32 net_tx_rndhash(void)
 {
	u32 v = prandom_u32();
diff --git a/net/core/sock.c b/net/core/sock.c
index 0d91f7dca751..d0f83260cddd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2383,8 +2383,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
-	} else
+		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
+	} else {
		sk->sk_wq	=	NULL;
+		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
+	}

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
diff --git a/net/socket.c b/net/socket.c
index 263b334ec5e4..1012991fb560 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -520,9 +520,23 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
	return used;
 }

+int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	int err = simple_setattr(dentry, iattr);
+
+	if (!err) {
+		struct socket *sock = SOCKET_I(d_inode(dentry));
+
+		sock->sk->sk_uid = iattr->ia_uid;
+	}
+
+	return err;
+}
+
 static const struct inode_operations sockfs_inode_ops = {
	.getxattr = sockfs_getxattr,
	.listxattr = sockfs_listxattr,
+	.setattr = sockfs_setattr,
 };

 /**

From 03441d56d878c40acd8e595548a68996199c2135 Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti
Date: Fri, 4 Nov 2016 02:23:42 +0900
Subject: [PATCH 0579/3220] net: core: add UID to flows, rules, and routes

- Define a new FIB rule attribute, FRA_UID_RANGE, to describe a range
  of UIDs.
- Define a RTA_UID attribute for per-UID route lookups and dumps.
- Support passing these attributes to and from userspace via rtnetlink.
  The value INVALID_UID indicates no UID was specified.
- Add a UID field to the flow structures.

Bug: 16355602
Change-Id: Iea98e6fedd0fd4435a1f4efa3deb3629505619ab
Signed-off-by: Lorenzo Colitti
Signed-off-by: David S.
Miller --- include/net/fib_rules.h | 9 ++++- include/net/flow.h | 5 +++ include/uapi/linux/fib_rules.h | 8 ++++ include/uapi/linux/rtnetlink.h | 3 ++ net/core/fib_rules.c | 72 +++++++++++++++++++++++++++++++++- net/ipv4/fib_frontend.c | 1 + net/ipv4/route.c | 11 ++++++ net/ipv6/route.c | 7 ++++ 8 files changed, 114 insertions(+), 2 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 59160de702b6..bd2b5c007561 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -8,6 +8,11 @@ #include #include +struct fib_kuid_range { + kuid_t start; + kuid_t end; +}; + struct fib_rule { struct list_head list; int iifindex; @@ -29,6 +34,7 @@ struct fib_rule { int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; + struct fib_kuid_range uid_range; struct rcu_head rcu; }; @@ -89,7 +95,8 @@ struct fib_rules_ops { [FRA_TABLE] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ - [FRA_GOTO] = { .type = NLA_U32 } + [FRA_GOTO] = { .type = NLA_U32 }, \ + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/net/flow.h b/include/net/flow.h index 83969eebebf3..8913962d7d25 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * ifindex generation is per-net namespace, and loopback is @@ -38,6 +39,7 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x08 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; + kuid_t flowic_uid; }; union flowi_uli { @@ -75,6 +77,7 @@ struct flowi4 { #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key +#define flowi4_uid __fl_common.flowic_uid /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -133,6 +136,7 @@ struct flowi6 { #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key +#define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; @@ -177,6 +181,7 @@ struct flowi { #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key +#define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 96161b8202b5..bbf02a63a011 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -29,6 +29,11 @@ struct fib_rule_hdr { __u32 flags; }; +struct fib_rule_uid_range { + __u32 start; + __u32 end; +}; + enum { FRA_UNSPEC, FRA_DST, /* destination address */ @@ -49,6 +54,9 @@ enum { FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, + FRA_PAD, + FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ + FRA_UID_RANGE, /* UID range */ __FRA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 123a5af4e8bb..d66101789bfd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -311,6 +311,9 @@ enum rtattr_type_t { RTA_PREF, RTA_ENCAP_TYPE, RTA_ENCAP, + RTA_EXPIRES, + RTA_PAD, + RTA_UID, __RTA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 365de66436ac..cb744a352167 
100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -18,6 +18,11 @@ #include #include +static const struct fib_kuid_range fib_kuid_range_unset = { + KUIDT_INIT(0), + KUIDT_INIT(~0), +}; + int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { @@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->table = table; r->flags = flags; r->fr_net = ops->fro_net; + r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_unregister); +static int uid_range_set(struct fib_kuid_range *range) +{ + return uid_valid(range->start) && uid_valid(range->end); +} + +static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) +{ + struct fib_rule_uid_range *in; + struct fib_kuid_range out; + + in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); + + out.start = make_kuid(current_user_ns(), in->start); + out.end = make_kuid(current_user_ns(), in->end); + + return out; +} + +static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) +{ + struct fib_rule_uid_range out = { + from_kuid_munged(current_user_ns(), range->start), + from_kuid_munged(current_user_ns(), range->end) + }; + + return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags) { @@ -189,6 +223,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; + if (uid_lt(fl->flowi_uid, rule->uid_range.start) || + uid_gt(fl->flowi_uid, rule->uid_range.end)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? 
!ret : ret; @@ -371,6 +409,21 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } else if (rule->action == FR_ACT_GOTO) goto errout_free; + if (tb[FRA_UID_RANGE]) { + if (current_user_ns() != net->user_ns) { + err = -EPERM; + goto errout_free; + } + + rule->uid_range = nla_get_kuid_range(tb); + + if (!uid_range_set(&rule->uid_range) || + !uid_lte(rule->uid_range.start, rule->uid_range.end)) + goto errout_free; + } else { + rule->uid_range = fib_kuid_range_unset; + } + err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; @@ -432,6 +485,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *tmp; struct nlattr *tb[FRA_MAX+1]; + struct fib_kuid_range range; int err = -EINVAL; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) @@ -451,6 +505,14 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (err < 0) goto errout; + if (tb[FRA_UID_RANGE]) { + range = nla_get_kuid_range(tb); + if (!uid_range_set(&range)) + goto errout; + } else { + range = fib_kuid_range_unset; + } + list_for_each_entry(rule, &ops->rules_list, list) { if (frh->action && (frh->action != rule->action)) continue; @@ -483,6 +545,11 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) continue; + if (uid_range_set(&range) && + (!uid_eq(rule->uid_range.start, range.start) || + !uid_eq(rule->uid_range.end, range.end))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -550,6 +617,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size(8); /* FRA_TUN_ID */ + + nla_total_size(sizeof(struct fib_kuid_range)); if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -607,7 +675,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id)) || + (uid_range_set(&rule->uid_range) && + nla_put_uid_range(skb, &rule->uid_range))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 473447593060..e10edb5e78b0 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -629,6 +629,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, + [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 85f184e429c6..25ca8d4b6565 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2422,6 +2422,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) + goto nla_put_failure; + error = rt->dst.error; if (rt_is_input_route(rt)) { @@ -2473,6 +2478,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int mark; struct sk_buff *skb; u32 table_id = RT_TABLE_MAIN; + kuid_t uid; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2500,6 +2506,10 @@ static int 
inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + if (tb[RTA_UID]) + uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); + else + uid = (iif ? INVALID_UID : current_uid()); memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; @@ -2507,6 +2517,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_tos = rtm->rtm_tos; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + fl4.flowi4_uid = uid; if (netif_index_is_l3_master(net, fl4.flowi4_oif)) fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 60359bea6a16..a1297c41d147 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2685,6 +2685,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, + [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3246,6 +3247,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); + if (tb[RTA_UID]) + fl6.flowi6_uid = make_kuid(current_user_ns(), + nla_get_u32(tb[RTA_UID])); + else + fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); + if (iif) { struct net_device *dev; int flags = 0; From 344afd627cca0223464079135926f611fdbb0574 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 4 Nov 2016 02:23:43 +0900 Subject: [PATCH 0580/3220] net: inet: Support UID-based routing in IP protocols. - Use the UID in routing lookups made by protocol connect() and sendmsg() functions. - Make sure that routing lookups triggered by incoming packets (e.g., Path MTU discovery) take the UID of the socket into account. - For packets not associated with a userspace socket (e.g., ping replies), use UID 0 inside the user namespace corresponding to the network namespace the socket belongs to. This allows all namespaces to apply routing and iptables rules to kernel-originated traffic in that namespace by matching UID 0. This is better than using the UID of the kernel socket that is sending the traffic, because the UID of kernel sockets created at namespace creation time (e.g., the per-processor ICMP and TCP sockets) is the UID of the user that created the socket, which might not be mapped in the namespace. Bug: 16355602 Change-Id: I910504b508948057912bc188fd1e8aca28294de3 Tested: compiles allnoconfig, allyesconfig, allmodconfig Tested: https://android-review.googlesource.com/253302 Signed-off-by: Lorenzo Colitti Signed-off-by: David S.
Miller --- include/net/flow.h | 4 +++- include/net/ip.h | 1 + include/net/ip6_route.h | 5 +++-- include/net/route.h | 5 +++-- net/ipv4/icmp.c | 2 ++ net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_output.c | 3 ++- net/ipv4/ping.c | 3 ++- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 26 +++++++++++++++----------- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_ipv4.c | 11 +++++++---- net/ipv4/udp.c | 3 ++- net/ipv6/af_inet6.c | 1 + net/ipv6/ah6.c | 5 +++-- net/ipv6/datagram.c | 1 + net/ipv6/esp6.c | 5 +++-- net/ipv6/icmp.c | 7 +++++-- net/ipv6/inet6_connection_sock.c | 2 ++ net/ipv6/ip6_gre.c | 4 ++++ net/ipv6/ip6_tunnel.c | 3 +++ net/ipv6/ip6_vti.c | 5 +++-- net/ipv6/ipcomp6.c | 5 +++-- net/ipv6/netfilter.c | 1 + net/ipv6/ping.c | 1 + net/ipv6/raw.c | 1 + net/ipv6/route.c | 13 +++++++++---- net/ipv6/syncookies.c | 1 + net/ipv6/tcp_ipv6.c | 2 ++ net/ipv6/udp.c | 1 + net/l2tp/l2tp_ip6.c | 1 + 31 files changed, 89 insertions(+), 41 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 8913962d7d25..833080732dec 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -97,7 +97,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, - __be16 dport, __be16 sport) + __be16 dport, __be16 sport, + kuid_t uid) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; @@ -108,6 +109,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; + fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; diff --git a/include/net/ip.h b/include/net/ip.h index 1a98f1ca1638..4f3ef345f4c2 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -170,6 +170,7 @@ struct ip_reply_arg { /* -1 if not needed */ int bound_dev_if; u8 tos; + kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 877f682989b8..55eea0bd2010 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -108,9 +108,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, - u32 mark); + u32 mark, kuid_t uid); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); -void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark); +void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, + kuid_t uid); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, u32 mark); void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); diff --git a/include/net/route.h b/include/net/route.h index a3b9ef74a389..3adb9c724818 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -154,7 +154,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? 
inet_sk_flowi_flags(sk) : 0, - daddr, saddr, dport, sport); + daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -267,7 +267,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - protocol, flow_flags, dst, src, dport, sport); + protocol, flow_flags, dst, src, dport, sport, + sk->sk_uid); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36e26977c908..ef2d4322aba7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -425,6 +425,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_mark = mark; + fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); @@ -473,6 +474,7 @@ static struct rtable *icmp_route_lookup(struct net *net, param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; fl4->flowi4_mark = mark; + fl4->flowi4_uid = sock_net_uid(net, NULL); fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 46b9c887bede..7e4a9c1fb615 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -420,7 +420,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num)); + htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -457,7 +457,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num)); + htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4233cbe47052..33bef2763c72 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1573,7 +1573,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, - tcp_hdr(skb)->source, tcp_hdr(skb)->dest); + tcp_hdr(skb)->source, tcp_hdr(skb)->dest, + arg->uid); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e89094ab5ddb..d5d3f0f5e5ad 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -789,7 +789,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, + sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bc35f1842512..fbf73f0130a7 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -599,7 +599,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), - daddr, saddr, 0, 0); + daddr, saddr, 0, 0, sk->sk_uid); if (!saddr && ipc.oif) { err = l3mdev_get_saddr(net, ipc.oif, &fl4); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 25ca8d4b6565..9ed5f69f84e2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -500,7 +500,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, +static void __build_flow_key(const struct net *net, struct flowi4 *fl4, + const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, u8 prot, u32 mark, int flow_flags) @@ -516,7 +517,8 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot, flow_flags, - iph->daddr, iph->saddr, 0, 0); + iph->daddr, iph->saddr, 0, 0, + sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, @@ -528,7 +530,7 @@ static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, u8 prot = iph->protocol; u32 mark = skb->mark; - __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(sock_net(sk), fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) @@ -545,7 +547,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, inet->hdrincl ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), - daddr, inet->inet_saddr, 0, 0); + daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); } @@ -793,7 +795,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf rt = (struct rtable *) dst; - __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } @@ -1011,7 +1013,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, if (!mark) mark = IP4_REPLY_MARK(net, skb->mark); - __build_flow_key(&fl4, NULL, iph, oif, + __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1027,7 +1029,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct flowi4 fl4; struct rtable *rt; - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); @@ -1046,6 +1048,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; + struct net *net = sock_net(sk); bh_lock_sock(sk); @@ -1059,7 +1062,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) goto out; } - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = (struct rtable *)odst; if (odst->obsolete && !odst->ops->check(odst, 0)) { @@ -1099,7 +1102,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, struct flowi4 fl4; struct rtable *rt; - __build_flow_key(&fl4, NULL, iph, oif, + __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1114,9 +1117,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; + struct net *net = sock_net(sk); - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - rt = __ip_route_output_key(sock_net(sk), &fl4); + __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4cbe9f0a4281..2dc982b15df8 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -375,7 +375,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest); + ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 205e6745393f..cf1efa823af6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -684,6 +684,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.bound_dev_if = sk->sk_bound_dev_if; arg.tos = ip_hdr(skb)->tos; + arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? 
sk : NULL); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -705,7 +706,8 @@ release_sk1: outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, +static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, + u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int reply_flags, u8 tos) @@ -720,7 +722,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = sock_net(sk); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -769,6 +771,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; + arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -782,7 +785,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, @@ -801,7 +804,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? + tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f8b3842b9070..8acf544794a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1023,7 +1023,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, - faddr, saddr, dport, inet->inet_sport); + faddr, saddr, dport, inet->inet_sport, + sk->sk_uid); if (!saddr && ipc.oif) { err = l3mdev_get_saddr(net, ipc.oif, fl4); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9dbfacb6e0d9..1604163c2850 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -678,6 +678,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; + fl6.flowi6_uid = sk->sk_uid; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); rcu_read_lock(); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 0630a4d5daaa..189eb10b742d 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -662,9 +662,10 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); else - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 517c55b01ba8..b0fa0dcf33d5 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -161,6 +161,7 @@ ipv4_connected: fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; + fl6.flowi6_uid = sk->sk_uid; if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) fl6.flowi6_oif = np->mcast_oif; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 060a60b2f8a6..218f0cba231c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -474,9 +474,10 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); else - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3697cd08c515..3ae2fbe07b25 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -92,9 +92,10 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); if (type == ICMPV6_PKT_TOOBIG) - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); else if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) @@ -478,6 +479,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; + fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); @@ -585,6 +587,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.flowi6_oif = l3mdev_fib_oif(skb->dev); fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; + 
fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index a7ca2cde2ecb..dc79ebc14189 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -86,6 +86,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); + fl6->flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi(fl6)); dst = ip6_dst_lookup_flow(sk, fl6, final_p); @@ -134,6 +135,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; + fl6->flowi6_uid = sk->sk_uid; security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); rcu_read_lock(); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index e5ea177d34c6..ae8510a5f604 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -792,6 +792,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ @@ -842,6 +844,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); if (err != 0) { if (err == -EMSGSIZE) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca42aaa6..fec9b8c5622c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1190,6 +1190,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPIP; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) @@ -1243,6 +1245,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPV6; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 0a8610b33d79..24fb9c0efd00 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -597,9 +597,10 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); else - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 1b9316e1386a..54d165b9845a 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -74,9 +74,10 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); else - ip6_update_pmtu(skb, net, info, 0, 0); + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); 
xfrm_state_put(x); return 0; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index d11c46833d61..39970e212ad5 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -26,6 +26,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) struct flowi6 fl6 = { .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, .flowi6_mark = skb->mark, + .flowi6_uid = sock_net_uid(net, skb->sk), .daddr = iph->daddr, .saddr = iph->saddr, }; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index a5cf82ccf406..dc338f876514 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -141,6 +141,7 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.daddr = *daddr; fl6.flowi6_oif = oif; fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 99140986e887..58eb8ee19f34 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -768,6 +768,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a1297c41d147..30835d96278e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1383,7 +1383,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, - int oif, u32 mark) + int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; @@ -1395,6 +1395,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); + fl6.flowi6_uid = uid; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) @@ -1406,7 +1407,7 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { ip6_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark); + sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); @@ -1487,7 +1488,8 @@ static struct dst_entry *ip6_route_redirect(struct net *net, flags, __ip6_route_redirect); } -void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) +void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, + kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; @@ -1500,6 +1502,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); + fl6.flowi6_uid = uid; dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); @@ -1521,6 +1524,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, fl6.flowi6_mark = mark; fl6.daddr = msg->dest; fl6.saddr = iph->daddr; + fl6.flowi6_uid = sock_net_uid(net, NULL); dst = ip6_route_redirect(net, &fl6, &iph->saddr); rt6_do_redirect(dst, NULL, skb); @@ -1529,7 +1533,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { - ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); + ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, + sk->sk_uid); } 
EXPORT_SYMBOL_GPL(ip6_sk_redirect); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index eaf7ac496d50..336843ca4e6b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -228,6 +228,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; + fl6.flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi(&fl6)); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f6eb65a5d343..1f5505ecf7f7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -234,6 +234,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; + fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); final_p = fl6_update_dst(&fl6, opt, &final); @@ -810,6 +811,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; + fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index fcce7b4ba8ff..0890fd6d4248 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1243,6 +1243,7 @@ do_udp_sendmsg: fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { opt = &opt_space; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index a2c8747d2936..c14c59f18c59 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -515,6 +515,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_uid = sk->sk_uid; if (lsa) { if (addr_len < SIN6_LEN_RFC2133) From 341965cf103dc4d60a829e9a55941c618a81358f Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 30 Nov 2016 02:56:47 +0900 Subject: [PATCH 0581/3220] net: ipv4: Don't crash if passing a null sk to ip_rt_update_pmtu. Commit e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") made __build_flow_key call sock_net(sk) to determine the network namespace of the passed-in socket. This crashes if sk is NULL. Fix this by getting the network namespace from the skb instead. Bug: 16355602 Change-Id: I27161b70f448bb95adce3994a97920d54987ce4e Fixes: e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") Reported-by: Erez Shitrit Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9ed5f69f84e2..ad5ce6652623 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -524,13 +524,14 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { + const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; u32 mark = skb->mark; - __build_flow_key(sock_net(sk), fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) From 0aac1fcfa327de74251daf85a496e3e863f6c004 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 20 Dec 2016 11:08:34 -0800 Subject: [PATCH 0582/3220] ANDROID: android-base: Enable QUOTA related configs Bug: 33757366 Change-Id: Iec4f55c3ca4a16dbc8695054f481d9261c56d0f6 --- android/configs/android-base.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 8531a7a79e33..f10371a981b7 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -139,7 +139,11 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_PROFILING=y +CONFIG_QFMT_V2=y CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +CONFIG_QUOTA_TREE=y +CONFIG_QUOTACTL=y CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y From 676b8efcce52d582153d3df3bc90689cab07a058 Mon Sep 17 00:00:00 2001 From: mukesh agrawal Date: Tue, 12 Jul 2016 11:28:05 -0700 Subject: [PATCH 0583/3220] ANDROID: trace: net: use %pK for kernel pointers We want to use network trace events in production builds, to help diagnose Wifi problems. However, we don't want to expose raw kernel pointers in such builds. Change the format specifier for the skbaddr field, so that, if kptr_restrict is enabled, the pointers will be reported as 0. 
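As a rough illustration, consider a hypothetical test module (pk_demo is an invented name, not part of this patch):

  #include <linux/module.h>
  #include <linux/printk.h>

  static int dummy;

  static int __init pk_demo_init(void)
  {
          /* With kptr_restrict set to 2, %pK always prints 0, while
           * %p still prints the raw pointer on kernels of this vintage. */
          pr_info("raw=%p restricted=%pK\n", &dummy, &dummy);
          return 0;
  }
  module_init(pk_demo_init);

  static void __exit pk_demo_exit(void) { }
  module_exit(pk_demo_exit);

  MODULE_LICENSE("GPL");

After "echo 2 > /proc/sys/kernel/kptr_restrict", the skbaddr values in the events below are likewise reported as 0.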
Bug: 30090733 Change-Id: Ic4bd583d37af6637343601feca875ee24479ddff Signed-off-by: mukesh agrawal --- include/trace/events/net.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 49cc7c3de252..89d009e10938 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -57,7 +57,7 @@ TRACE_EVENT(net_dev_start_xmit, __entry->gso_type = skb_shinfo(skb)->gso_type; ), - TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", + TP_printk("dev=%s queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, @@ -90,7 +90,7 @@ TRACE_EVENT(net_dev_xmit, __assign_str(name, dev->name); ), - TP_printk("dev=%s skbaddr=%p len=%u rc=%d", + TP_printk("dev=%s skbaddr=%pK len=%u rc=%d", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); @@ -112,7 +112,7 @@ DECLARE_EVENT_CLASS(net_dev_template, __assign_str(name, skb->dev->name); ), - TP_printk("dev=%s skbaddr=%p len=%u", + TP_printk("dev=%s skbaddr=%pK len=%u", __get_str(name), __entry->skbaddr, __entry->len) ) @@ -191,7 +191,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, __entry->gso_type = skb_shinfo(skb)->gso_type; ), - TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", + TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, From ee2bcbd8eddaca598b47284d3c52b593a9d09d7e Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 2 Jan 2017 20:18:05 +0530 Subject: [PATCH 0584/3220] ANDROID: sched/walt: fix build failure if FAIR_GROUP_SCHED=n Fix SCHED_WALT dependency on FAIR_GROUP_SCHED, otherwise we run into the following build failure: CC kernel/sched/walt.o kernel/sched/walt.c: In function 'walt_inc_cfs_cumulative_runnable_avg': kernel/sched/walt.c:148:8: error: 'struct cfs_rq' has no member named 'cumulative_runnable_avg' cfs_rq->cumulative_runnable_avg += p->ravg.demand; ^ kernel/sched/walt.c: In function 'walt_dec_cfs_cumulative_runnable_avg': kernel/sched/walt.c:154:8: error: 'struct cfs_rq' has no member named 'cumulative_runnable_avg' cfs_rq->cumulative_runnable_avg -= p->ravg.demand; ^ Reported-at: https://bugs.linaro.org/show_bug.cgi?id=2793 Signed-off-by: Amit Pundir --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index acb6645ffda5..445af1262134 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -395,6 +395,7 @@ endchoice config SCHED_WALT bool "Support window
based load tracking" depends on SMP + depends on FAIR_GROUP_SCHED help This feature will allow the scheduler to maintain a tunable window based set of metrics for tasks and runqueues. These metrics can be From 9ce0ba91cecce1377d9fca96eb097c428a44054f Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:31:55 -0800 Subject: [PATCH 0585/3220] Revert "FROMLIST: arm64: Enable CONFIG_ARM64_SW_TTBR0_PAN" This reverts commit 67cd3bda54dadba4f8892105adf9c2f3982bfa0a. Bug: 31432001 Change-Id: I1e5836ce0b41b2262d95c5c4c49ace3b96ae0b1f Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a4d3c3cd7b23..095a3afb1e9d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,14 +706,6 @@ config SETEND_EMULATION If unsure, say Y endif -config ARM64_SW_TTBR0_PAN - bool "Emulate Priviledged Access Never using TTBR0_EL1 switching" - help - Enabling this option prevents the kernel from accessing - user-space memory directly by pointing TTBR0_EL1 to a reserved - zeroed area and reserved ASID. The user access routines - restore the valid TTBR0_EL1 temporarily. - menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM From c558527eedcd5e166fb91db26ed183e5b88e036f Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:07 -0800 Subject: [PATCH 0586/3220] Revert "FROMLIST: arm64: xen: Enable user access before a privcmd hvc call" This reverts commit 4dbc88bd2b6a74fd33483ee2593dcf2bd858eabe. Bug: 31432001 Change-Id: I2c3d591a2c631e7ff02c0bcb91624735e8c12f0a Signed-off-by: Sami Tolvanen --- arch/arm64/xen/hypercall.S | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 6d6e4af1a4bf..8bbe9401f4f0 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,7 +49,6 @@ #include #include -#include #include @@ -90,24 +89,6 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Privcmd calls are issued by the userspace. The kernel needs to - * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 - * translations to user memory via AT instructions. Since AT - * instructions are not affected by the PAN bit (ARMv8.1), we only - * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation - * is enabled (it implies that hardware UAO and PAN disabled). - */ - uaccess_enable_not_uao x6, x7 -#endif hvc XEN_IMM - -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Disable userspace access from kernel once the hyp call completed. - */ - uaccess_disable_not_uao x6 -#endif ret ENDPROC(privcmd_call); From 1f150a5e5a9630492742def51ebbf665051fbada Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:16 -0800 Subject: [PATCH 0587/3220] Revert "FROMLIST: arm64: Handle faults caused by inadvertent user access with PAN enabled" This reverts commit 5dc2b7c7bb33138270ff9494be6cf334bd3d20e1. 
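For orientation while reading the flattened diff below: the variant being deleted treated a kernel-mode translation fault taken with the emulated PAN bit set as a blocked user access. Reassembled from the hunk itself (no new code), it read:

  static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs)
  {
          unsigned int ec = ESR_ELx_EC(esr);
          unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;

          if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
                  return false;

          /* Under TTBR0 PAN emulation, a translation fault taken with
           * PSR_PAN_BIT set means a blocked user access. */
          if (system_uses_ttbr0_pan())
                  return fsc_type == ESR_ELx_FSC_FAULT &&
                         (regs->pstate & PSR_PAN_BIT);
          else
                  return fsc_type == ESR_ELx_FSC_PERM;
  }

The revert goes back to classifying only genuine permission faults, independent of PSTATE.PAN.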
Bug: 31432001 Change-Id: I384a9af199f502f8fa3ae3733db67a4c547dbd55 Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9cedb10b1107..0aacbd763e6b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -246,19 +246,13 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) +static inline bool is_permission_fault(unsigned int esr) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) - return false; - - if (system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && - (regs->pstate & PSR_PAN_BIT); - else - return fsc_type == ESR_ELx_FSC_PERM; + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || + (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); } static bool is_el0_instruction_abort(unsigned int esr) @@ -299,7 +293,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (addr < USER_DS && is_permission_fault(esr, regs)) { + if (is_permission_fault(esr) && (addr < USER_DS)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -484,10 +478,10 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, { do_bad, SIGBUS, 0, "synchronous parity error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, From 77a4773a3b6e5b031ad3ecb68c217c45ed964fa3 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:25 -0800 Subject: [PATCH 0588/3220] Revert "FROMLIST: arm64: Disable TTBR0_EL1 during normal kernel execution" This reverts commit 5775ca34829caf0664c8ccc02fd0e93cb6022e0f. 
Bug: 31432001 Change-Id: I9b07c2f01bc2bcfed51f60ab487034639f5e1960 Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 26 +---------- arch/arm64/include/asm/mmu_context.h | 51 ++++++--------------- arch/arm64/include/asm/ptrace.h | 2 - arch/arm64/kernel/entry.S | 67 ---------------------------- arch/arm64/kernel/setup.c | 9 ---- arch/arm64/mm/context.c | 7 +-- 6 files changed, 16 insertions(+), 146 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 932f5a56d1a6..8e88a696c9cb 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,7 +1,6 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H -#include #include #include #include @@ -70,30 +69,7 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); static inline void efi_set_pgd(struct mm_struct *mm) { - __switch_mm(mm); - - if (system_uses_ttbr0_pan()) { - if (mm != current->active_mm) { - /* - * Update the current thread's saved ttbr0 since it is - * restored as part of a return from exception. Set - * the hardware TTBR0_EL1 using cpu_switch_mm() - * directly to enable potential errata workarounds. - */ - update_saved_ttbr0(current, mm); - cpu_switch_mm(mm->pgd, mm); - } else { - /* - * Defer the switch to the current thread's TTBR0_EL1 - * until uaccess_enable(). Restore the current - * thread's saved ttbr0 corresponding to its active_mm - * (if different from init_mm). - */ - cpu_set_reserved_ttbr0(); - if (current->active_mm != &init_mm) - update_saved_ttbr0(current, current->active_mm); - } - } + switch_mm(NULL, mm, NULL); } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 4a32fd5f101d..a00f7cf35bbd 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,7 +23,6 @@ #include #include -#include #include #include #include @@ -114,7 +113,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm && !system_uses_ttbr0_pan()) + if (mm != &init_mm) cpu_switch_mm(mm->pgd, mm); } @@ -174,27 +173,21 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -static inline void update_saved_ttbr0(struct task_struct *tsk, - struct mm_struct *mm) -{ - if (system_uses_ttbr0_pan()) { - BUG_ON(mm->pgd == swapper_pg_dir); - task_thread_info(tsk)->ttbr0 = - virt_to_phys(mm->pgd) | ASID(mm) << 48; - } -} -#else -static inline void update_saved_ttbr0(struct task_struct *tsk, - struct mm_struct *mm) -{ -} -#endif - -static inline void __switch_mm(struct mm_struct *next) +/* + * This is the actual mm switch as far as the scheduler + * is concerned. No registers are touched. We avoid + * calling the CPU specific function when the mm hasn't + * actually changed. + */ +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned int cpu = smp_processor_id(); + if (prev == next) + return; + /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. 
@@ -207,23 +200,7 @@ static inline void __switch_mm(struct mm_struct *next) check_and_switch_context(next, cpu); } -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - if (prev != next) - __switch_mm(next); - - /* - * Update the saved TTBR0_EL1 of the scheduled-in task as the previous - * value may have not been initialised yet (activate_mm caller) or the - * ASID has changed since the last run (following the context switch - * of another thread of the same process). - */ - update_saved_ttbr0(tsk, next); -} - #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, current) +#define activate_mm(prev,next) switch_mm(prev, next, NULL) #endif diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index dd4257e286b1..200d5c32b38f 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -21,8 +21,6 @@ #include -#define _PSR_PAN_BIT 22 - /* Current Exception Level values, as contained in CurrentEL */ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index eb185c93dfc6..8eb8eb085036 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,9 +29,7 @@ #include #include #include -#include #include -#include #include /* @@ -111,34 +109,6 @@ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] - -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Set the TTBR0 PAN bit in SPSR. When the exception is taken from - * EL0, there is no need to check the state of TTBR0_EL1 since - * accesses are always enabled. - * Note that the meaning of this bit differs from the ARMv8.1 PAN - * feature as all TTBR0_EL1 accesses are disabled, not just those to - * user mappings. - */ -alternative_if_not ARM64_HAS_PAN - nop -alternative_else - b 1f // skip TTBR0 PAN -alternative_endif - - .if \el != 0 - mrs x21, ttbr0_el1 - tst x21, #0xffff << 48 // Check for the reserved ASID - orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR - b.eq 1f // TTBR0 access already disabled - and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR - .endif - - uaccess_ttbr0_disable x21 -1: -#endif - stp x22, x23, [sp, #S_PC] /* @@ -177,42 +147,6 @@ alternative_endif ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter - .endif - -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR - * PAN bit checking. - */ -alternative_if_not ARM64_HAS_PAN - nop -alternative_else - b 2f // skip TTBR0 PAN -alternative_endif - - .if \el != 0 - tbnz x22, #_PSR_PAN_BIT, 1f // Skip re-enabling TTBR0 access if previously disabled - .endif - - uaccess_ttbr0_enable x0 - - .if \el == 0 - /* - * Enable errata workarounds only if returning to user. The only - * workaround currently required for TTBR0_EL1 changes are for the - * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache - * corruption). 
- */ - post_ttbr0_update_workaround - .endif -1: - .if \el != 0 - and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit - .endif -2: -#endif - - .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -234,7 +168,6 @@ alternative_else alternative_endif #endif .endif - msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 6591bf23422b..29b8c247d56f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -347,15 +347,6 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Make sure init_thread_info.ttbr0 always generates translation - * faults in case uaccess_enable() is inadvertently called by the init - * thread. - */ - init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); -#endif - #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 25128089c386..7275628ba59f 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -182,12 +182,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - /* - * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when - * emulating PAN. - */ - if (!system_uses_ttbr0_pan()) - cpu_switch_mm(mm->pgd, mm); + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) From b4ae8c8946b279fd142b8f85b0631606bc714a42 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:37 -0800 Subject: [PATCH 0589/3220] Revert "FROMLIST: arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1" This reverts commit 1911d36b27ba58ee18592df25b7ee636d4d4c41d. Bug: 31432001 Change-Id: Iee77eed8454f379b948dbbaf65c105952ea30bef Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 16 ----- arch/arm64/include/asm/cpufeature.h | 6 -- arch/arm64/include/asm/kernel-pgtable.h | 7 -- arch/arm64/include/asm/thread_info.h | 3 - arch/arm64/include/asm/uaccess.h | 96 ++----------------------- arch/arm64/kernel/asm-offsets.c | 3 - arch/arm64/kernel/cpufeature.c | 1 - arch/arm64/kernel/entry.S | 4 ++ arch/arm64/kernel/head.S | 6 +- arch/arm64/kernel/vmlinux.lds.S | 5 -- 10 files changed, 13 insertions(+), 134 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index aeb4554b3af3..9d3e77a5cf07 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -53,15 +53,6 @@ msr daifclr, #2 .endm - .macro save_and_disable_irq, flags - mrs \flags, daif - msr daifset, #2 - .endm - - .macro restore_irq, flags - msr daif, \flags - .endm - /* * Enable and disable debug exceptions. */ @@ -371,13 +362,6 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm -/* - * Return the current thread_info. - */ - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - /* * Errata workaround post TTBR0_EL1 update. 
*/ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f125c03ab2e1..727e594ac5c2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -188,12 +188,6 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } -static inline bool system_uses_ttbr0_pan(void) -{ - return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && - !cpus_have_cap(ARM64_HAS_PAN); -} - #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7803343e5881..7e51d1b57c0c 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,7 +19,6 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H -#include #include /* @@ -55,12 +54,6 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -#define RESERVED_TTBR0_SIZE (PAGE_SIZE) -#else -#define RESERVED_TTBR0_SIZE (0) -#endif - /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index b3325a9cb90f..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -47,9 +47,6 @@ typedef unsigned long mm_segment_t; struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - u64 ttbr0; /* saved TTBR0_EL1 */ -#endif struct task_struct *task; /* main task structure */ int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c37c064d7cdd..c8ef22a9a83b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -28,7 +28,6 @@ #include #include -#include #include #include #include @@ -129,57 +128,16 @@ static inline void set_fs(mm_segment_t fs) /* * User access enabling/disabling. */ -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -static inline void uaccess_ttbr0_disable(void) -{ - unsigned long ttbr; - - /* reserved_ttbr0 placed at the end of swapper_pg_dir */ - ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; - write_sysreg(ttbr, ttbr0_el1); - isb(); -} - -static inline void uaccess_ttbr0_enable(void) -{ - unsigned long flags; - - /* - * Disable interrupts to avoid preemption between reading the 'ttbr0' - * variable and the MSR. A context switch could trigger an ASID - * roll-over and an update of 'ttbr0'. 
- */ - local_irq_save(flags); - write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); - isb(); - local_irq_restore(flags); -} -#else -static inline void uaccess_ttbr0_disable(void) -{ -} - -static inline void uaccess_ttbr0_enable(void) -{ -} -#endif - #define __uaccess_disable(alt) \ do { \ - if (system_uses_ttbr0_pan()) \ - uaccess_ttbr0_disable(); \ - else \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) #define __uaccess_enable(alt) \ do { \ - if (system_uses_ttbr0_pan()) \ - uaccess_ttbr0_enable(); \ - else \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) static inline void uaccess_disable(void) @@ -409,39 +367,12 @@ extern __must_check long strnlen_user(const char __user *str, long n); #include #include -#include /* - * User access enabling/disabling macros. - */ - .macro uaccess_ttbr0_disable, tmp1 - mrs \tmp1, ttbr1_el1 // swapper_pg_dir - add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir - msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 - isb - .endm - - .macro uaccess_ttbr0_enable, tmp1 - get_thread_info \tmp1 - ldr \tmp1, [\tmp1, #TI_TTBR0] // load saved TTBR0_EL1 - msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 - isb - .endm - -/* - * These macros are no-ops when UAO is present. + * User access enabling/disabling macros. These are no-ops when UAO is + * present. */ .macro uaccess_disable_not_uao, tmp1 -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -alternative_if_not ARM64_HAS_PAN - uaccess_ttbr0_disable \tmp1 -alternative_else - nop - nop - nop - nop -alternative_endif -#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else @@ -450,21 +381,6 @@ alternative_endif .endm .macro uaccess_enable_not_uao, tmp1, tmp2 -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -alternative_if_not ARM64_HAS_PAN - save_and_disable_irq \tmp2 // avoid preemption - uaccess_ttbr0_enable \tmp1 - restore_irq \tmp2 -alternative_else - nop - nop - nop - nop - nop - nop - nop -alternative_endif -#endif alternative_if_not ARM64_ALT_PAN_NOT_UAO nop alternative_else diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index d0ec987dba5b..087cf9a65359 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,9 +36,6 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - DEFINE(TI_TTBR0, offsetof(struct thread_info, ttbr0)); -#endif DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); BLANK(); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 40ee3f2933e7..7566cad9fa1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -43,7 +43,6 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); -EXPORT_SYMBOL(cpu_hwcaps); #define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8eb8eb085036..533e1c9fd5a6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -190,6 +190,10 @@ alternative_endif eret // return to kernel .endm + .macro get_thread_info, rd + mrs \rd, sp_el0 + 
.endm + .macro irq_stack_entry mov x19, sp // preserve the original sp diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 461d6cc258dd..16c62dea7934 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -322,14 +322,14 @@ __create_page_tables: * dirty cache lines being evicted. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + add x1, x26, #SWAPPER_DIR_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ mov x0, x25 - add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + add x6, x26, #SWAPPER_DIR_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -407,7 +407,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + add x1, x26, #SWAPPER_DIR_SIZE dmb sy bl __inval_cache_range diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f1d6c49dcc5f..c241ea5359b9 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -185,11 +185,6 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - reserved_ttbr0 = .; - . += RESERVED_TTBR0_SIZE; -#endif - _end = .; STABS_DEBUG From 81688dbb3294585e137270f84275e6e94cbc9d7e Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:46 -0800 Subject: [PATCH 0590/3220] Revert "FROMLIST: arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro" This reverts commit 3b66929169de053042d47e482dd5748794756153. Bug: 31432001 Change-Id: Ib38fcf553ca2077531cbf550fbaa75378a8723c5 Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 17 ----------------- arch/arm64/mm/proc.S | 11 ++++++++++- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9d3e77a5cf07..9e8ac1e73457 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -362,21 +362,4 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm -/* - * Errata workaround post TTBR0_EL1 update. - */ - .macro post_ttbr0_update_workaround -#ifdef CONFIG_CAVIUM_ERRATUM_27456 -alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 - nop - nop - nop -alternative_else - ic iallu - dsb nsh - isb -alternative_endif -#endif - .endm - #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 765713702625..9f6deacf41d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -136,8 +136,17 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb - post_ttbr0_update_workaround +alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 ret + nop + nop + nop +alternative_else + ic iallu + dsb nsh + isb + ret +alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From c86266edfaa4e39a5f784e1c9ee8ec6ba2803cc1 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 14 Dec 2016 12:32:56 -0800 Subject: [PATCH 0591/3220] Revert "FROMLIST: arm64: Factor out PAN enabling/disabling into separate uaccess_* macros" This reverts commit 23368b642deb01ac6ce668ec1dedfcc0cab25c71. 
Bug: 31432001 Change-Id: Ia59e5fc75ef905b89d5f9194f1e762c1e5eff5bf Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/futex.h | 14 ++--- arch/arm64/include/asm/uaccess.h | 79 +++------------------------- arch/arm64/kernel/armv8_deprecated.c | 10 ++-- arch/arm64/lib/clear_user.S | 8 +-- arch/arm64/lib/copy_from_user.S | 8 +-- arch/arm64/lib/copy_in_user.S | 8 +-- arch/arm64/lib/copy_to_user.S | 8 +-- 7 files changed, 40 insertions(+), 95 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 71dfa3b42313..f2585cdd32c2 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -27,9 +27,9 @@ #include #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ -do { \ - uaccess_enable(); \ asm volatile( \ + ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +44,11 @@ do { \ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ + ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory"); \ - uaccess_disable(); \ -} while (0) + : "memory") static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +118,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; - uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +134,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, " .popsection\n" _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) +ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); - uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c8ef22a9a83b..c3d445b42351 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,8 +18,6 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H -#ifndef __ASSEMBLY__ - /* * User space memory access functions */ @@ -125,44 +123,6 @@ static inline void set_fs(mm_segment_t fs) " .long (" #from " - .), (" #to " - .)\n" \ " .popsection\n" -/* - * User access enabling/disabling. - */ -#define __uaccess_disable(alt) \ -do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ -} while (0) - -#define __uaccess_enable(alt) \ -do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ -} while (0) - -static inline void uaccess_disable(void) -{ - __uaccess_disable(ARM64_HAS_PAN); -} - -static inline void uaccess_enable(void) -{ - __uaccess_enable(ARM64_HAS_PAN); -} - -/* - * These functions are no-ops when UAO is present. 
- */ -static inline void uaccess_disable_not_uao(void) -{ - __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); -} - -static inline void uaccess_enable_not_uao(void) -{ - __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); -} - /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -190,7 +150,8 @@ static inline void uaccess_enable_not_uao(void) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ + CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -211,8 +172,9 @@ do { \ default: \ BUILD_BUG(); \ } \ - uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ + CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -257,7 +219,8 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ + CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -278,7 +241,8 @@ do { \ default: \ BUILD_BUG(); \ } \ - uaccess_disable_not_uao(); \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ + CONFIG_ARM64_PAN)); \ } while (0) #define __put_user(x, ptr) \ @@ -363,31 +327,4 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); -#else /* __ASSEMBLY__ */ - -#include -#include - -/* - * User access enabling/disabling macros. These are no-ops when UAO is - * present. 
- */ - .macro uaccess_disable_not_uao, tmp1 -alternative_if_not ARM64_ALT_PAN_NOT_UAO - nop -alternative_else - SET_PSTATE_PAN(1) -alternative_endif - .endm - - .macro uaccess_enable_not_uao, tmp1, tmp2 -alternative_if_not ARM64_ALT_PAN_NOT_UAO - nop -alternative_else - SET_PSTATE_PAN(0) -alternative_endif - .endm - -#endif /* __ASSEMBLY__ */ - #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 562f7ddb158b..c37202c0c838 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -281,9 +281,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) * Error-checking SWP macros implemented using ldxr{b}/stxr{b} */ #define __user_swpX_asm(data, addr, res, temp, B) \ -do { \ - uaccess_enable(); \ __asm__ __volatile__( \ + ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%3]\n" \ "1: stxr"B" %w0, %w1, [%3]\n" \ " cbz %w0, 2f\n" \ @@ -299,11 +299,11 @@ do { \ " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ + ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT) \ - : "memory"); \ - uaccess_disable(); \ -} while (0) + : "memory") #define __user_swp_asm(data, addr, res, temp) \ __user_swpX_asm(data, addr, res, temp, "") diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 08b5f18ba604..5d1cad3ce6d6 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,10 @@ */ #include +#include #include #include #include -#include .text @@ -33,7 +33,8 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) - uaccess_enable_not_uao x2, x3 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -53,7 +54,8 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 - uaccess_disable_not_uao x2 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 6505ec81f1da..0b90497d4424 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,11 @@ #include +#include #include #include #include #include -#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,10 +67,12 @@ end .req x5 ENTRY(__arch_copy_from_user) - uaccess_enable_not_uao x3, x4 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 9b04ff3ab610..f7292dd08c84 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,11 @@ #include +#include #include #include #include #include -#include /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,10 +68,12 @@ end .req x5 ENTRY(__copy_in_user) - uaccess_enable_not_uao x3, x4 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ 
+ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 8077e4f34d56..7a7efe255034 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,11 @@ #include +#include #include #include #include #include -#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,10 +66,12 @@ end .req x5 ENTRY(__arch_copy_to_user) - uaccess_enable_not_uao x3, x4 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ + CONFIG_ARM64_PAN) mov x0, #0 ret ENDPROC(__arch_copy_to_user) From 12ac5b67a13dabdb6e9cd9c0811bcebe12a23b1c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Sep 2016 16:40:23 +0100 Subject: [PATCH 0592/3220] UPSTREAM: arm64: barriers: introduce nops and __nops macros for NOP sequences NOP sequences tend to get used for padding out alternative sections and uarch-specific pipeline flushes in errata workarounds. This patch adds macros for generating these sequences as both inline asm blocks, but also as strings suitable for embedding in other asm blocks directly. Signed-off-by: Will Deacon Bug: 31432001 Change-Id: I7f82b677a065ede302a763d39ffcc3fef83f8fbe (cherry picked from commit f99a250cb6a3b301b101b4c0f5fcb80593bba6dc) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 9 +++++++++ arch/arm64/include/asm/barrier.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 9e8ac1e73457..bacc75b6c78c 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -98,6 +98,15 @@ dmb \opt .endm +/* + * NOP sequence + */ + .macro nops, num + .rept \num + nop + .endr + .endm + /* * Emit an entry into the exception table */ diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 9622eb48f894..c5dbc5cb8f10 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -20,6 +20,9 @@ #ifndef __ASSEMBLY__ +#define __nops(n) ".rept " #n "\nnop\n.endr\n" +#define nops(n) asm volatile(__nops(n)) + #define sev() asm volatile("sev" : : : "memory") #define wfe() asm volatile("wfe" : : : "memory") #define wfi() asm volatile("wfi" : : : "memory") From 50874355b1f54648e28504df38e772f9e52bc2f9 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 7 Sep 2016 11:07:08 +0100 Subject: [PATCH 0593/3220] UPSTREAM: arm64: alternative: add auto-nop infrastructure In some cases, one side of an alternative sequence is simply a number of NOPs used to balance the other side. Keeping track of this manually is tedious, and the presence of large chains of NOPs makes the code more painful to read than necessary. To ameliorate matters, this patch adds a new alternative_else_nop_endif, which automatically balances an alternative sequence with a trivial NOP sled. In many cases, we would like a NOP-sled in the default case, and instructions patched in in the presence of a feature. 
To enable the NOPs to be generated automatically for this case, this
patch also adds a new alternative_if, and updates alternative_else and
alternative_endif to work with either alternative_if or
alternative_if_not.

Cc: Andre Przywara
Cc: Catalin Marinas
Cc: Dave Martin
Cc: James Morse
Signed-off-by: Mark Rutland
[will: use new nops macro to generate nop sequences]
Signed-off-by: Will Deacon
Bug: 31432001
Change-Id: I28d8aae073e113048577c41cfe27c91215fb4cf3
(cherry picked from commit 792d47379f4d4c76692f1795f33d38582f8907fa)
Signed-off-by: Sami Tolvanen
---
 arch/arm64/include/asm/alternative.h | 72 +++++++++++++++++++++-------
 1 file changed, 54 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 8746ff6abd77..55101bd86b98 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -2,6 +2,7 @@
 #define __ASM_ALTERNATIVE_H
 
 #include
+#include
 
 #ifndef __ASSEMBLY__
 
@@ -90,24 +91,15 @@
 .endm
 
 /*
- * Begin an alternative code sequence.
+ * Alternative sequences
  *
- * The code that follows this macro will be assembled and linked as
- * normal. There are no restrictions on this code.
- */
-.macro alternative_if_not cap
- .pushsection .altinstructions, "a"
- altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f
- .popsection
-661:
-.endm
-
-/*
- * Provide the alternative code sequence.
+ * The code for the case where the capability is not present will be
+ * assembled and linked as normal. There are no restrictions on this
+ * code.
  *
- * The code that follows this macro is assembled into a special
- * section to be used for dynamic patching. Code that follows this
- * macro must:
+ * The code for the case where the capability is present will be
+ * assembled into a special section to be used for dynamic patching.
+ * Code for that case must:
  *
  * 1. Be exactly the same length (in bytes) as the default code
  *    sequence.
@@ -116,8 +108,38 @@
  *    alternative sequence it is defined in (branches into an
  *    alternative sequence are not fixed up).
  */
+
+/*
+ * Begin an alternative code sequence.
+ */
+.macro alternative_if_not cap
+ .set .Lasm_alt_mode, 0
+ .pushsection .altinstructions, "a"
+ altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f
+ .popsection
+661:
+.endm
+
+.macro alternative_if cap
+ .set .Lasm_alt_mode, 1
+ .pushsection .altinstructions, "a"
+ altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f
+ .popsection
+ .pushsection .altinstr_replacement, "ax"
+ .align 2 /* So GAS knows label 661 is suitably aligned */
+661:
+.endm
+
+/*
+ * Provide the other half of the alternative code sequence.
+ */
 .macro alternative_else
-662: .pushsection .altinstr_replacement, "ax"
+662:
+ .if .Lasm_alt_mode==0
+ .pushsection .altinstr_replacement, "ax"
+ .else
+ .popsection
+ .endif
663:
 .endm
 
@@ -125,11 +147,25 @@
  * Complete an alternative code sequence.
  */
 .macro alternative_endif
-664: .popsection
+664:
+ .if .Lasm_alt_mode==0
+ .popsection
+ .endif
 .org . - (664b-663b) + (662b-661b)
 .org . - (662b-661b) + (664b-663b)
 .endm
 
+/*
+ * Provides a trivial alternative or default sequence consisting solely
+ * of NOPs. The number of NOPs is chosen automatically to match the
+ * previous case.
+ */ +.macro alternative_else_nop_endif +alternative_else + nops (662b-661b) / AARCH64_INSN_SIZE +alternative_endif +.endm + #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) From a858462e1f9fdd5e67cf4cc94fa3fc3fa9846228 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 14:58:21 +0100 Subject: [PATCH 0594/3220] BACKPORT: arm64: Factor out PAN enabling/disabling into separate uaccess_* macros This patch moves the directly coded alternatives for turning PAN on/off into separate uaccess_{enable,disable} macros or functions. The asm macros take a few arguments which will be used in subsequent patches. Note that any (unlikely) access that the compiler might generate between uaccess_enable() and uaccess_disable(), other than those explicitly specified by the user access code, will not be protected by PAN. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I75a410139d0756edab3210ee091fa5d047a22e04 (cherry picked from commit bd38967d406fb4f9fca67d612db71b5d74cfb0f5) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/futex.h | 17 +++--- arch/arm64/include/asm/uaccess.h | 79 ++++++++++++++++++++++++---- arch/arm64/kernel/armv8_deprecated.c | 11 ++-- arch/arm64/lib/clear_user.S | 11 ++-- arch/arm64/lib/copy_from_user.S | 11 ++-- arch/arm64/lib/copy_in_user.S | 11 ++-- arch/arm64/lib/copy_to_user.S | 11 ++-- 7 files changed, 93 insertions(+), 58 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f2585cdd32c2..85c4a8981d47 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -21,15 +21,12 @@ #include #include -#include -#include #include -#include #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ +do { \ + uaccess_enable(); \ asm volatile( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +41,11 @@ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +115,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" -ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +131,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " .popsection\n" _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) -ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); + uaccess_disable(); *uval = val; return ret; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index c3d445b42351..8259eded53dc 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,16 +18,19 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H +#include +#include + +#ifndef __ASSEMBLY__ + /* * User space memory access functions */ #include 
#include -#include #include #include -#include #include #include #include @@ -123,6 +126,44 @@ static inline void set_fs(mm_segment_t fs) " .long (" #from " - .), (" #to " - .)\n" \ " .popsection\n" +/* + * User access enabling/disabling. + */ +#define __uaccess_disable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +#define __uaccess_enable(alt) \ +do { \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +static inline void uaccess_disable(void) +{ + __uaccess_disable(ARM64_HAS_PAN); +} + +static inline void uaccess_enable(void) +{ + __uaccess_enable(ARM64_HAS_PAN); +} + +/* + * These functions are no-ops when UAO is present. + */ +static inline void uaccess_disable_not_uao(void) +{ + __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); +} + +static inline void uaccess_enable_not_uao(void) +{ + __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); +} + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -150,8 +191,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -172,9 +212,8 @@ do { \ default: \ BUILD_BUG(); \ } \ + uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -219,8 +258,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -241,8 +279,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_disable_not_uao(); \ } while (0) #define __put_user(x, ptr) \ @@ -327,4 +364,26 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +#else /* __ASSEMBLY__ */ + +#include + +/* + * User access enabling/disabling macros. These are no-ops when UAO is + * present. 
+ */ + .macro uaccess_disable_not_uao, tmp1 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(1) +alternative_else_nop_endif + .endm + + .macro uaccess_enable_not_uao, tmp1, tmp2 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(0) +alternative_else_nop_endif + .endm + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c37202c0c838..74dc6c1d97ee 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -281,9 +280,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) * Error-checking SWP macros implemented using ldxr{b}/stxr{b} */ #define __user_swpX_asm(data, addr, res, temp, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%3]\n" \ "1: stxr"B" %w0, %w1, [%3]\n" \ " cbz %w0, 2f\n" \ @@ -299,11 +298,11 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp) \ __user_swpX_asm(data, addr, res, temp, "") diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 5d1cad3ce6d6..d7150e30438a 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,7 @@ */ #include -#include -#include -#include -#include +#include .text @@ -33,8 +30,7 @@ * Alignment fixed up by hardware. 
*/ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -54,8 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x2 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 0b90497d4424..90154f3f7f2a 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,8 @@ #include -#include -#include #include -#include -#include +#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,12 +64,10 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index f7292dd08c84..718b1c4e2f85 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,8 @@ #include -#include -#include #include -#include -#include +#include /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,12 +65,10 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7a7efe255034..e99e31c9acac 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,8 @@ #include -#include -#include #include -#include -#include +#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,12 +63,10 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__arch_copy_to_user) From 4fe5f2cb94341423c484483625215628134906ca Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 15:48:55 +0100 Subject: [PATCH 0595/3220] BACKPORT: arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro This patch takes the errata workaround code out of cpu_do_switch_mm into a dedicated post_ttbr0_update_workaround macro which will be reused in a subsequent patch. 
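For illustration, the macro's effect can be sketched in C using the
alternatives framework (a hedged sketch only: the helper name is made
up, and it assumes the ALTERNATIVE() macro and the
ARM64_WORKAROUND_CAVIUM_27456 capability bit already used elsewhere in
this series):

static inline void post_ttbr0_update_workaround_sketch(void)
{
	/*
	 * Three NOPs by default; the I-cache invalidation sequence is
	 * patched in on CPUs affected by Cavium erratum 27456. Both
	 * sides are three instructions, satisfying the equal-length
	 * rule for alternatives.
	 */
	asm(ALTERNATIVE("nop\n\tnop\n\tnop",
			"ic iallu\n\tdsb nsh\n\tisb",
			ARM64_WORKAROUND_CAVIUM_27456,
			CONFIG_CAVIUM_ERRATUM_27456));
}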
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Reviewed-by: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I2b45b11ab7390c3545b9e162532109c1526bef14 (cherry picked from commit f33bcf03e6079668da6bf4eec4a7dcf9289131d0) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 13 +++++++++++++ arch/arm64/mm/proc.S | 11 +---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bacc75b6c78c..7b2a8925ac86 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -371,4 +371,17 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Errata workaround post TTBR0_EL1 update. + */ + .macro post_ttbr0_update_workaround +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +alternative_if ARM64_WORKAROUND_CAVIUM_27456 + ic iallu + dsb nsh + isb +alternative_else_nop_endif +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9f6deacf41d2..765713702625 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -136,17 +136,8 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -alternative_if_not ARM64_WORKAROUND_CAVIUM_27456 + post_ttbr0_update_workaround ret - nop - nop - nop -alternative_else - ic iallu - dsb nsh - isb - ret -alternative_endif ENDPROC(cpu_do_switch_mm) .pushsection ".idmap.text", "ax" From b83fbf1e77257a9436b27f487925c7d3347f901f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 16:53:00 +0100 Subject: [PATCH 0596/3220] BACKPORT: arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1 This patch adds the uaccess macros/functions to disable access to user space by setting TTBR0_EL1 to a reserved zeroed page. Since the value written to TTBR0_EL1 must be a physical address, for simplicity this patch introduces a reserved_ttbr0 page at a constant offset from swapper_pg_dir. The uaccess_disable code uses the ttbr1_el1 value adjusted by the reserved_ttbr0 offset. Enabling access to user is done by restoring TTBR0_EL1 with the value from the struct thread_info ttbr0 variable. Interrupts must be disabled during the uaccess_ttbr0_enable code to ensure the atomicity of the thread_info.ttbr0 read and TTBR0_EL1 write. This patch also moves the get_thread_info asm macro from entry.S to assembler.h for reuse in the uaccess_ttbr0_* macros. 
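As a rough usage sketch (hypothetical code, not from this patch; it
assumes the uaccess_enable()/uaccess_disable() helpers introduced
earlier in this series and omits the exception table fixup a real user
accessor needs), every user access window is bracketed so that
TTBR0_EL1 points at the user page tables only while they are in use:

static inline unsigned long load_user_word_sketch(const unsigned long __user *src)
{
	unsigned long val;

	uaccess_enable();	/* restore thread_info->ttbr0 into TTBR0_EL1 */
	asm volatile("ldtr %0, [%1]"	/* unprivileged load, no fault fixup here */
		     : "=r" (val) : "r" (src));
	uaccess_disable();	/* TTBR0_EL1 back to the reserved zeroed page */

	return val;
}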
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I54ada623160cb47f5762e0e39a5e84a75252dbfd (cherry picked from commit 4b65a5db362783ab4b04ca1c1d2ad70ed9b0ba2a) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/assembler.h | 16 ++++ arch/arm64/include/asm/cpufeature.h | 6 ++ arch/arm64/include/asm/kernel-pgtable.h | 7 ++ arch/arm64/include/asm/thread_info.h | 3 + arch/arm64/include/asm/uaccess.h | 108 ++++++++++++++++++++++-- arch/arm64/kernel/asm-offsets.c | 3 + arch/arm64/kernel/cpufeature.c | 1 + arch/arm64/kernel/entry.S | 4 - arch/arm64/kernel/head.S | 6 +- arch/arm64/kernel/vmlinux.lds.S | 5 ++ 10 files changed, 146 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 7b2a8925ac86..d8855ca6068a 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -53,6 +53,15 @@ msr daifclr, #2 .endm + .macro save_and_disable_irq, flags + mrs \flags, daif + msr daifset, #2 + .endm + + .macro restore_irq, flags + msr daif, \flags + .endm + /* * Enable and disable debug exceptions. */ @@ -371,6 +380,13 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Return the current thread_info. + */ + .macro get_thread_info, rd + mrs \rd, sp_el0 + .endm + /* * Errata workaround post TTBR0_EL1 update. */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 727e594ac5c2..f125c03ab2e1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -188,6 +188,12 @@ static inline bool system_supports_mixed_endian_el0(void) return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } +static inline bool system_uses_ttbr0_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && + !cpus_have_cap(ARM64_HAS_PAN); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7e51d1b57c0c..7803343e5881 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include #include /* @@ -54,6 +55,12 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +#define RESERVED_TTBR0_SIZE (PAGE_SIZE) +#else +#define RESERVED_TTBR0_SIZE (0) +#endif + /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..794d22603f04 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -48,6 +48,9 @@ struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ +#endif int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ }; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 8259eded53dc..955c6e58a624 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -19,6 +19,7 @@ #define __ASM_UACCESS_H #include +#include #include #ifndef __ASSEMBLY__ @@ -129,16 +130,71 @@ static inline void set_fs(mm_segment_t fs) /* * User access 
enabling/disabling. */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void __uaccess_ttbr0_disable(void) +{ + unsigned long ttbr; + + /* reserved_ttbr0 placed at the end of swapper_pg_dir */ + ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; + write_sysreg(ttbr, ttbr0_el1); + isb(); +} + +static inline void __uaccess_ttbr0_enable(void) +{ + unsigned long flags; + + /* + * Disable interrupts to avoid preemption between reading the 'ttbr0' + * variable and the MSR. A context switch could trigger an ASID + * roll-over and an update of 'ttbr0'. + */ + local_irq_save(flags); + write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + isb(); + local_irq_restore(flags); +} + +static inline bool uaccess_ttbr0_disable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_disable(); + return true; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_enable(); + return true; +} +#else +static inline bool uaccess_ttbr0_disable(void) +{ + return false; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + return false; +} +#endif + #define __uaccess_disable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ + if (!uaccess_ttbr0_disable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) #define __uaccess_enable(alt) \ do { \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ + if (uaccess_ttbr0_enable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ } while (0) static inline void uaccess_disable(void) @@ -369,16 +425,56 @@ extern __must_check long strnlen_user(const char __user *str, long n); #include /* - * User access enabling/disabling macros. These are no-ops when UAO is - * present. + * User access enabling/disabling macros. + */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + .macro __uaccess_ttbr0_disable, tmp1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + isb + .endm + + .macro __uaccess_ttbr0_enable, tmp1 + get_thread_info \tmp1 + ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1 + msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 + isb + .endm + + .macro uaccess_ttbr0_disable, tmp1 +alternative_if_not ARM64_HAS_PAN + __uaccess_ttbr0_disable \tmp1 +alternative_else_nop_endif + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 +alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption + __uaccess_ttbr0_enable \tmp1 + restore_irq \tmp2 +alternative_else_nop_endif + .endm +#else + .macro uaccess_ttbr0_disable, tmp1 + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 + .endm +#endif + +/* + * These macros are no-ops when UAO is present. 
*/ .macro uaccess_disable_not_uao, tmp1 + uaccess_ttbr0_disable \tmp1 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(1) alternative_else_nop_endif .endm .macro uaccess_enable_not_uao, tmp1, tmp2 + uaccess_ttbr0_enable \tmp1, \tmp2 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(0) alternative_else_nop_endif diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 087cf9a65359..40ef661bd3a5 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -38,6 +38,9 @@ int main(void) DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0)); +#endif BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); BLANK(); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7566cad9fa1d..40ee3f2933e7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -43,6 +43,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); #define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 533e1c9fd5a6..8eb8eb085036 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -190,10 +190,6 @@ alternative_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 16c62dea7934..461d6cc258dd 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -322,14 +322,14 @@ __create_page_tables: * dirty cache lines being evicted. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ mov x0, x25 - add x6, x26, #SWAPPER_DIR_SIZE + add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -407,7 +407,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index c241ea5359b9..f1d6c49dcc5f 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -185,6 +185,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . += RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG From 02ef7a8c3c9dedda83188574cd4d3071d307b814 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 2 Sep 2016 14:54:03 +0100 Subject: [PATCH 0597/3220] BACKPORT: arm64: Disable TTBR0_EL1 during normal kernel execution When the TTBR0 PAN feature is enabled, the kernel entry points need to disable access to TTBR0_EL1. The PAN status of the interrupted context is stored as part of the saved pstate, reusing the PSR_PAN_BIT (22). Restoring access to TTBR0_EL1 is done on exception return if returning to user or returning to a context where PAN was disabled. 
Context switching via switch_mm() must defer the update of TTBR0_EL1 until a return to user or an explicit uaccess_enable() call. Special care needs to be taken for two cases where TTBR0_EL1 is set outside the normal kernel context switch operation: EFI run-time services (via efi_set_pgd) and CPU suspend (via cpu_(un)install_idmap). Code has been added to avoid deferred TTBR0_EL1 switching as in switch_mm() and restore the reserved TTBR0_EL1 when uninstalling the special TTBR0_EL1. User cache maintenance (user_cache_maint_handler and __flush_cache_user_range) needs the TTBR0_EL1 re-instated since the operations are performed by user virtual address. This patch also removes a stale comment on the switch_mm() function. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I85a49f70e13b153b9903851edf56f6531c14e6de (cherry picked from commit 39bc88e5e38e9b213bd7d833ce0df6ec029761ad) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/efi.h | 26 +++++++++++- arch/arm64/include/asm/mmu_context.h | 53 ++++++++++++++++------- arch/arm64/kernel/entry.S | 63 ++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 9 ++++ arch/arm64/mm/cache.S | 6 ++- arch/arm64/mm/context.c | 7 +++- 6 files changed, 147 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cb..932f5a56d1a6 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include #include #include #include @@ -69,7 +70,30 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); static inline void efi_set_pgd(struct mm_struct *mm) { - switch_mm(NULL, mm, NULL); + __switch_mm(mm); + + if (system_uses_ttbr0_pan()) { + if (mm != current->active_mm) { + /* + * Update the current thread's saved ttbr0 since it is + * restored as part of a return from exception. Set + * the hardware TTBR0_EL1 using cpu_switch_mm() + * directly to enable potential errata workarounds. + */ + update_saved_ttbr0(current, mm); + cpu_switch_mm(mm->pgd, mm); + } else { + /* + * Defer the switch to the current thread's TTBR0_EL1 + * until uaccess_enable(). Restore the current + * thread's saved ttbr0 corresponding to its active_mm + * (if different from init_mm). + */ + cpu_set_reserved_ttbr0(); + if (current->active_mm != &init_mm) + update_saved_ttbr0(current, current->active_mm); + } + } } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a00f7cf35bbd..e53d30c6f779 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -113,7 +114,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm) + if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } @@ -173,21 +174,27 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. 
- */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ + if (system_uses_ttbr0_pan()) { + BUG_ON(mm->pgd == swapper_pg_dir); + task_thread_info(tsk)->ttbr0 = + virt_to_phys(mm->pgd) | ASID(mm) << 48; + } +} +#else +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ +} +#endif + +static inline void __switch_mm(struct mm_struct *next) { unsigned int cpu = smp_processor_id(); - if (prev == next) - return; - /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. @@ -200,7 +207,25 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, check_and_switch_context(next, cpu); } +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + if (prev != next) + __switch_mm(next); + + /* + * Update the saved TTBR0_EL1 of the scheduled-in task as the previous + * value may have not been initialised yet (activate_mm caller) or the + * ASID has changed since the last run (following the context switch + * of another thread of the same process). Avoid setting the reserved + * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit). + */ + if (next != &init_mm) + update_saved_ttbr0(tsk, next); +} + #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, NULL) +#define activate_mm(prev,next) switch_mm(prev, next, current) #endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8eb8eb085036..d6f10ab36c15 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include #include /* @@ -109,6 +111,32 @@ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. + */ +alternative_if ARM64_HAS_PAN + b 1f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + __uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -147,6 +175,40 @@ ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if ARM64_HAS_PAN + b 2f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + .endif + + __uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). 
+ */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -168,6 +230,7 @@ alternative_else alternative_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 29b8c247d56f..6591bf23422b 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -347,6 +347,15 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure init_thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. + */ + init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 07d7352d7c38..3be2cda5dbda 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -23,6 +23,7 @@ #include #include #include +#include /* * flush_icache_range(start,end) @@ -48,6 +49,7 @@ ENTRY(flush_icache_range) * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) + uaccess_ttbr0_enable x2, x3 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 @@ -69,10 +71,12 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU dsb ish isb mov x0, #0 +1: + uaccess_ttbr0_disable x1 ret 9: mov x0, #-EFAULT - ret + b 1b ENDPROC(flush_icache_range) ENDPROC(__flush_cache_user_range) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 7275628ba59f..25128089c386 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -182,7 +182,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpu_switch_mm(mm->pgd, mm); + /* + * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when + * emulating PAN. + */ + if (!system_uses_ttbr0_pan()) + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) From 17080bcf2c6ec3e34b36f68d7e4b37b55ff9d16f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:22:39 +0100 Subject: [PATCH 0598/3220] UPSTREAM: arm64: Handle faults caused by inadvertent user access with PAN enabled When TTBR0_EL1 is set to the reserved page, an erroneous kernel access to user space would generate a translation fault. This patch adds the checks for the software-set PSR_PAN_BIT to emulate a permission fault and report it accordingly. 
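Put differently, with TTBR0_EL1 pointing at the reserved zeroed page a
stray kernel access misses in translation rather than on permissions,
so the check amounts to the following (an illustrative helper only;
the real logic lives in is_permission_fault() in the hunk below):

static bool is_emulated_pan_fault_sketch(unsigned int esr, struct pt_regs *regs)
{
	/* the reserved TTBR0 page can only yield translation faults */
	if ((esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
		return false;

	/* which count as PAN hits only if the emulated PAN bit was set */
	return !!(regs->pstate & PSR_PAN_BIT);
}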
Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I87e48f6075f84878e4d26d4fadf6eaac49d2cb4e (cherry picked from commit 786889636ad75296c213547d1ca656af4c59f390) Signed-off-by: Sami Tolvanen --- arch/arm64/mm/fault.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0aacbd763e6b..bff871f5b10a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -246,13 +246,19 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || - (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + else + return fsc_type == ESR_ELx_FSC_PERM; } static bool is_el0_instruction_abort(unsigned int esr) @@ -293,7 +299,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (is_permission_fault(esr) && (addr < USER_DS)) { + if (addr < USER_DS && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); From 9fcab0c5b4280c6fc860b23ae6335712a4eb77c6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 5 Jul 2016 12:25:15 +0100 Subject: [PATCH 0599/3220] UPSTREAM: arm64: xen: Enable user access before a privcmd hvc call Privcmd calls are issued by the userspace. The kernel needs to enable access to TTBR0_EL1 as the hypervisor would issue stage 1 translations to user memory via AT instructions. Since AT instructions are not affected by the PAN bit (ARMv8.1), we only need the explicit uaccess_enable/disable if the TTBR0 PAN option is enabled. Reviewed-by: Julien Grall Acked-by: Stefano Stabellini Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I64d827923d869c1868702c8a18efa99ea91d3151 (cherry picked from commit 9cf09d68b89ae5fe0261dcc69464bcc676900af6) Signed-off-by: Sami Tolvanen --- arch/arm64/xen/hypercall.S | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 8bbe9401f4f0..b96db5dafec4 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,6 +49,7 @@ #include #include +#include #include @@ -89,6 +90,20 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 + /* + * Privcmd calls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN disabled). + */ + uaccess_ttbr0_enable x6, x7 hvc XEN_IMM + + /* + * Disable userspace access from kernel once the hyp call completed. 
+ */ + uaccess_ttbr0_disable x6 ret ENDPROC(privcmd_call); From 7f89f7225caf80f6d23a3396ca4cc1fd3ccd4685 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 1 Jul 2016 18:25:31 +0100 Subject: [PATCH 0600/3220] UPSTREAM: arm64: Enable CONFIG_ARM64_SW_TTBR0_PAN This patch adds the Kconfig option to enable support for TTBR0 PAN emulation. The option is default off because of a slight performance hit when enabled, caused by the additional TTBR0_EL1 switching during user access operations or exception entry/exit code. Cc: Will Deacon Cc: James Morse Cc: Kees Cook Cc: Mark Rutland Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: I2f0b5f332e3c56ea0453ff69826525dec49f034b (cherry picked from commit ba42822af1c287f038aa550f3578c61c212a892e) Signed-off-by: Sami Tolvanen --- arch/arm64/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 095a3afb1e9d..a9089fa82221 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,6 +706,14 @@ config SETEND_EMULATION If unsure, say Y endif +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM From 5937c0601163e53fe5595692a8fedd428034b017 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 12 Dec 2016 13:50:26 +0000 Subject: [PATCH 0601/3220] UPSTREAM: arm64: Disable PAN on uaccess_enable() Commit 4b65a5db3627 ("arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1") added conditional user access enable/disable. Unfortunately, a typo prevents the PAN bit from being cleared for user access functions. Restore the PAN functionality by adding the missing '!'. 
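In a simplified sketch of the resulting logic (the macro body is quoted
from the hunk below; "alt" is the macro's capability argument):
uaccess_ttbr0_enable() returns false when the TTBR0_EL1 switching
scheme is not in use, which is exactly the case that must fall back to
clearing PSTATE.PAN:

	/* buggy: the PAN-clearing alternative ran only on the TTBR0 path */
	if (uaccess_ttbr0_enable())
		asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, CONFIG_ARM64_PAN));

	/* fixed: clear PAN whenever the TTBR0 path is not taken */
	if (!uaccess_ttbr0_enable())
		asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, CONFIG_ARM64_PAN));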
Fixes: 4b65a5db3627 ("arm64: Introduce uaccess_{disable,enable} functionality based on TTBR0_EL1") Reported-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Catalin Marinas Bug: 31432001 Change-Id: If61cb6cc756affc7df7fa06213723a8b96eb1e80 (cherry picked from commit 75037120e62b58c536999eb23d70cfcb6d6c0bcc) Signed-off-by: Sami Tolvanen --- arch/arm64/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 955c6e58a624..efafdf39cb3b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -192,7 +192,7 @@ do { \ #define __uaccess_enable(alt) \ do { \ - if (uaccess_ttbr0_enable()) \ + if (!uaccess_ttbr0_enable()) \ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ CONFIG_ARM64_PAN)); \ } while (0) From d5dce523ee59d843a8292d887333fe2fd1c7277a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 4 Jan 2017 09:11:04 -0800 Subject: [PATCH 0602/3220] ANDROID: configs: CONFIG_ARM64_SW_TTBR0_PAN=y Bug: 31432001 Change-Id: Ia72c3aa70a463d3a7f52b76e5082520aa328d29b Signed-off-by: Sami Tolvanen --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 3fd0b13488a1..70aaae17ad29 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -8,6 +8,7 @@ # CONFIG_VT is not set CONFIG_ANDROID_TIMED_GPIO=y CONFIG_ARM_KERNMEM_PERMS=y +CONFIG_ARM64_SW_TTBR0_PAN=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y From d5ed6f6f5f92d98f1f4329ff1c5a19fbb0f25550 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 30 Dec 2016 17:42:32 -0600 Subject: [PATCH 0603/3220] net: socket: don't set sk_uid to garbage value in ->setattr() ->setattr() was recently implemented for socket files to sync the socket inode's uid to the new 'sk_uid' member of struct sock. It does this by copying over the ia_uid member of struct iattr. However, ia_uid is actually only valid when ATTR_UID is set in ia_valid, indicating that the uid is being changed, e.g. by chown. Other metadata operations such as chmod or utimes leave ia_uid uninitialized. Therefore, sk_uid could be set to a "garbage" value from the stack. Fix this by only copying the uid over when ATTR_UID is set. [cherry-pick of net e1a3a60a2ebe991605acb14cd58e39c0545e174e] Bug: 16355602 Change-Id: I20e53848e54282b72a388ce12bfa88da5e3e9efe Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Signed-off-by: Eric Biggers Tested-by: Lorenzo Colitti Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index 1012991fb560..29c20061704e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -524,7 +524,7 @@ int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); - if (!err) { + if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); sock->sk->sk_uid = iattr->ia_uid; From 3e61a942c7eb5378c2a1c31bc3236995b3bdf7f2 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 21 Apr 2016 18:04:53 +0100 Subject: [PATCH 0604/3220] MIPS: Prevent "restoration" of MSA context in non-MSA kernels commit 6533af4d4831c421cd9aa4dce7cfc19a3514cc09 upstream. If a kernel doesn't support MSA context (ie.
CONFIG_CPU_HAS_MSA=n) then it will only keep 64 bits per FP register in thread context, and the calls to set_fpr64 in restore_msa_extcontext will overrun the end of the FP register context into the FCSR & MSACSR values. GCC 6.x has become smart enough to detect this & complain like so: arch/mips/kernel/signal.c: In function 'protected_restore_fp_context': ./arch/mips/include/asm/processor.h:114:17: error: array subscript is above array bounds [-Werror=array-bounds] fpr->val##width[FPR_IDX(width, idx)] = val; \ ~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~ ./arch/mips/include/asm/processor.h:118:1: note: in expansion of macro 'BUILD_FPR_ACCESS' BUILD_FPR_ACCESS(64) The only way to trigger this code to run would be for a program to set up an artificial extended MSA context structure following a sigframe & execute sigreturn. Whilst this doesn't allow a program to write to any state that it couldn't already, it makes little sense to allow this "restoration" of MSA context in a system that doesn't support MSA. Fix this by killing a program with SIGSYS if it tries something as crazy as "restoring" fake MSA context in this way, also fixing the build error & allowing for most of restore_msa_extcontext to be optimised out of kernels without support for MSA. Signed-off-by: Paul Burton Reported-by: Michal Toman Fixes: bf82cb30c7e5 ("MIPS: Save MSA extended context around signals") Tested-by: Aaro Koskinen Cc: James Hogan Cc: Michal Toman Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/13164/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/signal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index bf792e2839a6..fc7c1f0b3d8d 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -195,6 +195,9 @@ static int restore_msa_extcontext(void __user *buf, unsigned int size) unsigned int csr; int i, err; + if (!config_enabled(CONFIG_CPU_HAS_MSA)) + return SIGSYS; + if (size != sizeof(*msa)) return -EINVAL; @@ -398,8 +401,8 @@ int protected_restore_fp_context(void __user *sc) } fp_done: - if (used & USED_EXTCONTEXT) - err |= restore_extcontext(sc_to_extcontext(sc)); + if (!err && (used & USED_EXTCONTEXT)) + err = restore_extcontext(sc_to_extcontext(sc)); return err ?: sig; } From 1475f79c87f07664f9af64f5d2e77d360855ac28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Dec 2016 09:44:53 -0800 Subject: [PATCH 0605/3220] UPSTREAM: net: avoid signed overflows for SO_{SND|RCV}BUFFORCE (cherry picked from commit b98b0bc8c431e3ceb4b26b0dfc8db509518fb290) CAP_NET_ADMIN users should not be allowed to set negative sk_sndbuf or sk_rcvbuf values, as it can lead to various memory corruptions, crashes, OOM... Note that before commit 82981930125a ("net: cleanups in sock_setsockopt()"), the bug was even more serious, since SO_SNDBUF and SO_RCVBUF were vulnerable. This needs to be backported to all known linux kernels. Again, many thanks to syzkaller team for discovering this gem. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Signed-off-by: David S. 
Miller Change-Id: I2b621c28c02267af5b34a379b2970fe5fb61a4f6 Bug: 33363517 --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index d0f83260cddd..3334c4bf62cd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -745,7 +745,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, val = min_t(u32, val, sysctl_wmem_max); set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -781,7 +781,7 @@ set_rcvbuf: * returning the value we actually used in getsockopt * is the most desirable behavior. */ - sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF); + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_RCVBUFFORCE: From 56ea977b0fc3d2e7e68e5a93a2b44aa3e25e34ed Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 16 May 2016 17:28:16 +0800 Subject: [PATCH 0606/3220] UPSTREAM: netlink: Fix dump skb leak/double free (cherry picked from commit 92964c79b357efd980812c4de5c1fd2ec8bb5520) When we free cb->skb after a dump, we do it after releasing the lock. This means that a new dump could have started in the time being and we'll end up freeing their skb instead of ours. This patch saves the skb and module before we unlock so we free the right memory. Fixes: 16b304f3404f ("netlink: Eliminate kmalloc in netlink dump operation.") Reported-by: Baozeng Ding Signed-off-by: Herbert Xu Acked-by: Cong Wang Signed-off-by: David S. Miller Change-Id: Ie2db6a32a49686c6d22c4a88c251b288343c7813 Bug: 33393474 --- net/netlink/af_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 59651af8cc27..ff36e88d19d9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2784,6 +2784,7 @@ static int netlink_dump(struct sock *sk) struct netlink_callback *cb; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; + struct module *module; int len, err = -ENOBUFS; int alloc_min_size; int alloc_size; @@ -2863,9 +2864,11 @@ static int netlink_dump(struct sock *sk) cb->done(cb); nlk->cb_running = false; + module = cb->module; + skb = cb->skb; mutex_unlock(nlk->cb_mutex); - module_put(cb->module); - consume_skb(cb->skb); + module_put(module); + consume_skb(skb); return 0; errout_skb: From a11f71fcd94d5315bc6fbfe74ec6c5cbb19104d1 Mon Sep 17 00:00:00 2001 From: Philip Pettersson Date: Wed, 30 Nov 2016 14:55:36 -0800 Subject: [PATCH 0607/3220] UPSTREAM: packet: fix race condition in packet_set_ring (cherry picked from commit 84ac7260236a49c79eede91617700174c2c19b0c) When packet_set_ring creates a ring buffer it will initialize a struct timer_list if the packet version is TPACKET_V3. This value can then be raced by a different thread calling setsockopt to set the version to TPACKET_V1 before packet_set_ring has finished. This leads to a use-after-free on a function pointer in the struct timer_list when the socket is closed as the previously initialized timer will not be deleted. The bug is fixed by taking lock_sock(sk) in packet_setsockopt when changing the packet version while also taking the lock at the start of packet_set_ring. Fixes: f6fb8f100b80 ("af-packet: TPACKET_V3 flexible buffer implementation.") Signed-off-by: Philip Pettersson Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller Change-Id: Ice451620ecf2c2a5ba3709f45fbb5f3f5c5bb389 Bug: 33358926 --- net/packet/af_packet.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 992396aa635c..caffff01f410 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3577,19 +3577,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: - po->tp_version = val; - return 0; + break; default: return -EINVAL; } + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_version = val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_RESERVE: { @@ -4071,6 +4077,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; + lock_sock(sk); /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { WARN(1, "Tx-ring is not supported.\n"); @@ -4152,7 +4159,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; } - lock_sock(sk); /* Detach socket from network */ spin_lock(&po->bind_lock); @@ -4201,11 +4207,11 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } - release_sock(sk); if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: + release_sock(sk); return err; } From f4e6542320b9ce98a4598bf168b0b4d12c7c2b65 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 18 Nov 2016 22:13:00 +0100 Subject: [PATCH 0608/3220] UPSTREAM: l2tp: fix racy SOCK_ZAPPED flag check in l2tp_ip{,6}_bind() (cherry picked from commit 32c231164b762dddefa13af5a0101032c70b50ef) Lock socket before checking the SOCK_ZAPPED flag in l2tp_ip6_bind(). Without lock, a concurrent call could modify the socket flags between the sock_flag(sk, SOCK_ZAPPED) test and the lock_sock() call. This way, a socket could be inserted twice in l2tp_ip6_bind_table. Releasing it would then leave a stale pointer there, generating use-after-free errors when walking through the list or modifying adjacent entries. 
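The race window is easiest to see as a sketch of the old check-then-lock ordering against the fixed lock-then-check ordering (simplified from the diff below):

    /* old, racy: flag read outside the socket lock */
    if (!sock_flag(sk, SOCK_ZAPPED))        /* another thread may bind here */
            return -EINVAL;
    lock_sock(sk);
    /* ... insert into the bind table: the socket can now be added twice */

    /* fixed: the flag is read and acted on inside one critical section */
    lock_sock(sk);
    if (!sock_flag(sk, SOCK_ZAPPED))
            goto out_unlock;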
BUG: KASAN: use-after-free in l2tp_ip6_close+0x22e/0x290 at addr ffff8800081b0ed8 Write of size 8 by task syz-executor/10987 CPU: 0 PID: 10987 Comm: syz-executor Not tainted 4.8.0+ #39 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014 ffff880031d97838 ffffffff829f835b ffff88001b5a1640 ffff8800081b0ec0 ffff8800081b15a0 ffff8800081b6d20 ffff880031d97860 ffffffff8174d3cc ffff880031d978f0 ffff8800081b0e80 ffff88001b5a1640 ffff880031d978e0 Call Trace: [] dump_stack+0xb3/0x118 lib/dump_stack.c:15 [] kasan_object_err+0x1c/0x70 mm/kasan/report.c:156 [< inline >] print_address_description mm/kasan/report.c:194 [] kasan_report_error+0x1f6/0x4d0 mm/kasan/report.c:283 [< inline >] kasan_report mm/kasan/report.c:303 [] __asan_report_store8_noabort+0x3e/0x40 mm/kasan/report.c:329 [< inline >] __write_once_size ./include/linux/compiler.h:249 [< inline >] __hlist_del ./include/linux/list.h:622 [< inline >] hlist_del_init ./include/linux/list.h:637 [] l2tp_ip6_close+0x22e/0x290 net/l2tp/l2tp_ip6.c:239 [] inet_release+0xed/0x1c0 net/ipv4/af_inet.c:415 [] inet6_release+0x50/0x70 net/ipv6/af_inet6.c:422 [] sock_release+0x8d/0x1d0 net/socket.c:570 [] sock_close+0x16/0x20 net/socket.c:1017 [] __fput+0x28c/0x780 fs/file_table.c:208 [] ____fput+0x15/0x20 fs/file_table.c:244 [] task_work_run+0xf9/0x170 [] do_exit+0x85e/0x2a00 [] do_group_exit+0x108/0x330 [] get_signal+0x617/0x17a0 kernel/signal.c:2307 [] do_signal+0x7f/0x18f0 [] exit_to_usermode_loop+0xbf/0x150 arch/x86/entry/common.c:156 [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190 [] syscall_return_slowpath+0x1a0/0x1e0 arch/x86/entry/common.c:259 [] entry_SYSCALL_64_fastpath+0xc4/0xc6 Object at ffff8800081b0ec0, in cache L2TP/IPv6 size: 1448 Allocated: PID = 10987 [ 1116.897025] [] save_stack_trace+0x16/0x20 [ 1116.897025] [] save_stack+0x46/0xd0 [ 1116.897025] [] kasan_kmalloc+0xad/0xe0 [ 1116.897025] [] kasan_slab_alloc+0x12/0x20 [ 1116.897025] [< inline >] slab_post_alloc_hook mm/slab.h:417 [ 1116.897025] [< inline >] slab_alloc_node mm/slub.c:2708 [ 1116.897025] [< inline >] slab_alloc mm/slub.c:2716 [ 1116.897025] [] kmem_cache_alloc+0xc8/0x2b0 mm/slub.c:2721 [ 1116.897025] [] sk_prot_alloc+0x69/0x2b0 net/core/sock.c:1326 [ 1116.897025] [] sk_alloc+0x38/0xae0 net/core/sock.c:1388 [ 1116.897025] [] inet6_create+0x2d7/0x1000 net/ipv6/af_inet6.c:182 [ 1116.897025] [] __sock_create+0x37b/0x640 net/socket.c:1153 [ 1116.897025] [< inline >] sock_create net/socket.c:1193 [ 1116.897025] [< inline >] SYSC_socket net/socket.c:1223 [ 1116.897025] [] SyS_socket+0xef/0x1b0 net/socket.c:1203 [ 1116.897025] [] entry_SYSCALL_64_fastpath+0x23/0xc6 Freed: PID = 10987 [ 1116.897025] [] save_stack_trace+0x16/0x20 [ 1116.897025] [] save_stack+0x46/0xd0 [ 1116.897025] [] kasan_slab_free+0x71/0xb0 [ 1116.897025] [< inline >] slab_free_hook mm/slub.c:1352 [ 1116.897025] [< inline >] slab_free_freelist_hook mm/slub.c:1374 [ 1116.897025] [< inline >] slab_free mm/slub.c:2951 [ 1116.897025] [] kmem_cache_free+0xc8/0x330 mm/slub.c:2973 [ 1116.897025] [< inline >] sk_prot_free net/core/sock.c:1369 [ 1116.897025] [] __sk_destruct+0x32b/0x4f0 net/core/sock.c:1444 [ 1116.897025] [] sk_destruct+0x44/0x80 net/core/sock.c:1452 [ 1116.897025] [] __sk_free+0x53/0x220 net/core/sock.c:1460 [ 1116.897025] [] sk_free+0x23/0x30 net/core/sock.c:1471 [ 1116.897025] [] sk_common_release+0x28c/0x3e0 ./include/net/sock.h:1589 [ 1116.897025] [] l2tp_ip6_close+0x1fe/0x290 net/l2tp/l2tp_ip6.c:243 [ 1116.897025] [] 
inet_release+0xed/0x1c0 net/ipv4/af_inet.c:415 [ 1116.897025] [] inet6_release+0x50/0x70 net/ipv6/af_inet6.c:422 [ 1116.897025] [] sock_release+0x8d/0x1d0 net/socket.c:570 [ 1116.897025] [] sock_close+0x16/0x20 net/socket.c:1017 [ 1116.897025] [] __fput+0x28c/0x780 fs/file_table.c:208 [ 1116.897025] [] ____fput+0x15/0x20 fs/file_table.c:244 [ 1116.897025] [] task_work_run+0xf9/0x170 [ 1116.897025] [] do_exit+0x85e/0x2a00 [ 1116.897025] [] do_group_exit+0x108/0x330 [ 1116.897025] [] get_signal+0x617/0x17a0 kernel/signal.c:2307 [ 1116.897025] [] do_signal+0x7f/0x18f0 [ 1116.897025] [] exit_to_usermode_loop+0xbf/0x150 arch/x86/entry/common.c:156 [ 1116.897025] [< inline >] prepare_exit_to_usermode arch/x86/entry/common.c:190 [ 1116.897025] [] syscall_return_slowpath+0x1a0/0x1e0 arch/x86/entry/common.c:259 [ 1116.897025] [] entry_SYSCALL_64_fastpath+0xc4/0xc6 Memory state around the buggy address: ffff8800081b0d80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8800081b0e00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8800081b0e80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ^ ffff8800081b0f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8800081b0f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== The same issue exists with l2tp_ip_bind() and l2tp_ip_bind_table. Fixes: c51ce49735c1 ("l2tp: fix oops in L2TP IP sockets for connect() AF_UNSPEC case") Reported-by: Baozeng Ding Reported-by: Andrey Konovalov Tested-by: Baozeng Ding Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller Change-Id: I74b0e6bf0d0a5e0e2f4d8a3c6e52ea75a572b114 Bug: 33753815 --- net/l2tp/l2tp_ip.c | 5 +++-- net/l2tp/l2tp_ip6.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index ec22078b0914..70a0f8083a3b 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -249,8 +249,6 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int ret; int chk_addr_ret; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; if (addr_len < sizeof(struct sockaddr_l2tpip)) return -EINVAL; if (addr->l2tp_family != AF_INET) @@ -265,6 +263,9 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) read_unlock_bh(&l2tp_ip_lock); lock_sock(sk); + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out; + if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) goto out; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index c14c59f18c59..b70e70776563 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -264,8 +264,6 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int addr_type; int err; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) @@ -291,6 +289,9 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) lock_sock(sk); err = -EINVAL; + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out_unlock; + if (sk->sk_state != TCP_CLOSE) goto out_unlock; From 7f18f0963d81a096e741cfc14ec9c2915f633e0a Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 9 Jan 2017 17:20:11 +0000 Subject: [PATCH 0609/3220] DEBUG: sched/fair: Fix missing sched_load_avg_cpu events update_cfs_rq_load_avg is called from update_blocked_averages without triggering the sched_load_avg_cpu event. Move the event trigger to inside update_cfs_rq_load_avg to avoid this missing event. 
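A sketch of the call paths involved (as described above, simplified) shows why relocating the tracepoint closes the gap:

    /*
     * update_load_avg() ------------.
     *                               +--> update_cfs_rq_load_avg()
     * update_blocked_averages() ----'              |
     *                                              +--> trace_sched_load_avg_cpu()
     *
     * Emitting the event inside update_cfs_rq_load_avg() covers both
     * callers; emitting it only from update_load_avg() missed updates
     * driven by update_blocked_averages().
     */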
Change-Id: I6c4f66f687a644e4e7f798db122d28a8f5919b7b Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b881cf81d79..aeb9b550470b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2726,6 +2726,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + return decayed || removed; } @@ -2749,7 +2751,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); - trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From 1cb392e10307ba3ef7d9a602e59e54ae3b6399ad Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 10 Jan 2017 11:31:01 +0000 Subject: [PATCH 0610/3220] DEBUG: sched/fair: Fix sched_load_avg_cpu events for task_groups The current sched_load_avg_cpu event traces the load for any cfs_rq that is updated. This is not representative of the CPU load - instead we should only trace this event when the cfs_rq being updated is in the root_task_group. Change-Id: I345c2f13f6b5718cb4a89beb247f7887ce97ed6b Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aeb9b550470b..7d4151601860 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2726,7 +2726,9 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */ + if (cfs_rq == &rq_of(cfs_rq)->cfs) + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); return decayed || removed; } From 82a6fedb6ea53c9f3eb00ab1e0b839d3fcfe5fc2 Mon Sep 17 00:00:00 2001 From: Luca Stefani Date: Fri, 13 Jan 2017 16:09:57 +0100 Subject: [PATCH 0611/3220] arm64: kernel: Fix build warning * Issue: After da643dc .enable expects an int -> Change cpu_enable_uao prototype to int Change-Id: I581a1afd13cd90637e6a7a315ede2af8950b401f Signed-off-by: Luca Stefani --- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/mm/fault.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 83d0aa97b577..4be934fde409 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -191,6 +191,6 @@ static inline void spin_lock_prefetch(const void *ptr) #endif int cpu_enable_pan(void *__unused); -void cpu_enable_uao(void *__unused); +int cpu_enable_uao(void *__unused); #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2c0a80ca3536..2581ede3075a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -658,8 +658,9 @@ int cpu_enable_pan(void *__unused) * We need to enable the feature at runtime (instead of adding it to * PSR_MODE_EL1h) as the feature may not be implemented by the cpu.
*/ -void cpu_enable_uao(void *__unused) +int cpu_enable_uao(void *__unused) { asm(SET_PSTATE_UAO(1)); + return 0; } #endif /* CONFIG_ARM64_UAO */ From 0ec03f8457996ad0067ae1f62b70a092f726bb95 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 13 Apr 2016 16:38:34 -0700 Subject: [PATCH 0612/3220] ANDROID: sdcardfs: override umask on mkdir and create The mode on files created on the lower fs should not be affected by the umask of the calling task's fs_struct. Instead, we create a copy and modify it as needed. This also lets us avoid the string shenanigans around .nomedia files. Bug: 27992761 Change-Id: Ia3a6e56c24c6e19b3b01c1827e46403bb71c2f4c Signed-off-by: Daniel Rosenberg --- fs/fs_struct.c | 1 + fs/sdcardfs/inode.c | 70 ++++++++++++++++++++++----------------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 7dca743b2ce1..005dcb401369 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -127,6 +127,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) } return fs; } +EXPORT_SYMBOL_GPL(copy_fs_struct); int unshare_fs_struct(void) { diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 2528da0d3ae1..4b140ba86955 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -19,6 +19,7 @@ */ #include "sdcardfs.h" +#include /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ const struct cred * override_fsids(struct sdcardfs_sb_info* sbi) @@ -56,6 +57,8 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct dentry *lower_parent_dentry = NULL; struct path lower_path; const struct cred *saved_cred = NULL; + struct fs_struct *saved_fs; + struct fs_struct *copied_fs; if(!check_caller_access_to_name(dir, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" @@ -74,6 +77,16 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, /* set last 16bytes of mode field to 0664 */ mode = (mode & S_IFMT) | 00664; + + /* temporarily change umask for lower fs write */ + saved_fs = current->fs; + copied_fs = copy_fs_struct(current->fs); + if (!copied_fs) { + err = -ENOMEM; + goto out_unlock; + } + current->fs = copied_fs; + current->fs->umask = 0; err = vfs_create(d_inode(lower_parent_dentry), lower_dentry, mode, want_excl); if (err) goto out; @@ -85,6 +98,9 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); out: + current->fs = saved_fs; + free_fs_struct(copied_fs); +out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); REVERT_CRED(saved_cred); @@ -245,11 +261,9 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; struct sdcardfs_inode_info *pi = SDCARDFS_I(dir); - char *page_buf; - char *nomedia_dir_name; - char *nomedia_fullpath; - int fullpath_namelen; int touch_err = 0; + struct fs_struct *saved_fs; + struct fs_struct *copied_fs; if(!check_caller_access_to_name(dir, dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" @@ -276,6 +290,16 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* set last 16bytes of mode field to 0775 */ mode = (mode & S_IFMT) | 00775; + + /* temporarily change umask for lower fs write */ + saved_fs = current->fs; + copied_fs = copy_fs_struct(current->fs); + if (!copied_fs) { + err = 
-ENOMEM; + goto out_unlock; + } + current->fs = copied_fs; + current->fs->umask = 0; err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode); if (err) @@ -316,42 +340,18 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* When creating /Android/data and /Android/obb, mark them as .nomedia */ if (make_nomedia_in_obb || ((pi->perm == PERM_ANDROID) && (!strcasecmp(dentry->d_name.name, "data")))) { - - page_buf = (char *)__get_free_page(GFP_KERNEL); - if (!page_buf) { - printk(KERN_ERR "sdcardfs: failed to allocate page buf\n"); - goto out; - } - - nomedia_dir_name = d_absolute_path(&lower_path, page_buf, PAGE_SIZE); - if (IS_ERR(nomedia_dir_name)) { - free_page((unsigned long)page_buf); - printk(KERN_ERR "sdcardfs: failed to get .nomedia dir name\n"); - goto out; - } - - fullpath_namelen = page_buf + PAGE_SIZE - nomedia_dir_name - 1; - fullpath_namelen += strlen("/.nomedia"); - nomedia_fullpath = kzalloc(fullpath_namelen + 1, GFP_KERNEL); - if (!nomedia_fullpath) { - free_page((unsigned long)page_buf); - printk(KERN_ERR "sdcardfs: failed to allocate .nomedia fullpath buf\n"); - goto out; - } - - strcpy(nomedia_fullpath, nomedia_dir_name); - free_page((unsigned long)page_buf); - strcat(nomedia_fullpath, "/.nomedia"); - touch_err = touch(nomedia_fullpath, 0664); + set_fs_pwd(current->fs, &lower_path); + touch_err = touch(".nomedia", 0664); if (touch_err) { - printk(KERN_ERR "sdcardfs: failed to touch(%s): %d\n", - nomedia_fullpath, touch_err); - kfree(nomedia_fullpath); + printk(KERN_ERR "sdcardfs: failed to create .nomedia in %s: %d\n", + lower_path.dentry->d_name.name, touch_err); goto out; } - kfree(nomedia_fullpath); } out: + current->fs = saved_fs; + free_fs_struct(copied_fs); +out_unlock: unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); out_revert: From cc16efd13c3015889efe0bfe4f2cb5f4020ec4ca Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 27 Apr 2016 15:31:29 -0700 Subject: [PATCH 0613/3220] ANDROID: sdcardfs: Check for other cases on path lookup This fixes a bug where the first lookup of a file or folder created under a different view would not be case insensitive. It will now search through for a case insensitive match if the initial lookup fails. 
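In outline, the fallback works as sketched here; find_ci_match is a hypothetical helper standing in for the inline d_subdirs scan in the diff below:

    err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, &lower_path);
    if (err == -ENOENT) {
            /* exact-case miss: retry with a case-insensitive sibling */
            struct dentry *match = find_ci_match(lower_dir_dentry, name);
            if (match) {
                    err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt,
                                          match->d_name.name, 0, &lower_path);
                    dput(match);
            }
    }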
Bug: 28024488 Change-Id: I4ff9ce297b9f2f9864b47540e740fd491c545229 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/lookup.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a01b06a514fd..a127d05b5054 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -240,6 +240,28 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, /* Use vfs_path_lookup to check if the dentry exists or not */ err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, &lower_path); + /* check for other cases */ + if (err == -ENOENT) { + struct dentry *child; + struct dentry *match = NULL; + spin_lock(&lower_dir_dentry->d_lock); + list_for_each_entry(child, &lower_dir_dentry->d_subdirs, d_child) { + if (child && d_inode(child)) { + if (strcasecmp(child->d_name.name, name)==0) { + match = dget(child); + break; + } + } + } + spin_unlock(&lower_dir_dentry->d_lock); + if (match) { + err = vfs_path_lookup(lower_dir_dentry, + lower_dir_mnt, + match->d_name.name, 0, + &lower_path); + dput(match); + } + } /* no error: handle positive dentries */ if (!err) { From 96acdab2f4de226a253abbc6f7bab149bb82c4bf Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 16 Aug 2016 15:19:26 -0700 Subject: [PATCH 0614/3220] ANDROID: sdcardfs: Fix locking for permission fix up Iterating over d_subdirs requires taking d_lock. Removed several unneeded locks. Change-Id: I5b1588e54c7e6ee19b756d6705171c7f829e2650 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 6 ++---- fs/sdcardfs/inode.c | 2 -- fs/sdcardfs/lookup.c | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 41e0e11b3c35..bfe402b8cf32 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -111,15 +111,15 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) void get_derive_permissions_recursive(struct dentry *parent) { struct dentry *dentry; + spin_lock(&parent->d_lock); list_for_each_entry(dentry, &parent->d_subdirs, d_child) { if (dentry->d_inode) { - mutex_lock(&dentry->d_inode->i_mutex); get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); get_derive_permissions_recursive(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); } } + spin_unlock(&parent->d_lock); } /* main function for updating derived permission */ @@ -135,7 +135,6 @@ inline void update_derived_permission_lock(struct dentry *dentry) * 1. need to check whether the dentry is updated or not * 2.
remove the root dentry update */ - mutex_lock(&dentry->d_inode->i_mutex); if(IS_ROOT(dentry)) { //setup_default_pre_root_state(dentry->d_inode); } else { @@ -146,7 +145,6 @@ inline void update_derived_permission_lock(struct dentry *dentry) } } fix_derived_permission(dentry->d_inode); - mutex_unlock(&dentry->d_inode->i_mutex); } int need_graft_path(struct dentry *dentry) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 4b140ba86955..1a23c0cc8f58 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -513,11 +513,9 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, } /* At this point, not all dentry information has been moved, so * we pass along new_dentry for the name.*/ - mutex_lock(&d_inode(old_dentry)->i_mutex); get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry); fix_derived_permission(d_inode(old_dentry)); get_derive_permissions_recursive(old_dentry); - mutex_unlock(&d_inode(old_dentry)->i_mutex); out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a127d05b5054..c74a7d1bc18e 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -387,11 +387,9 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_inode) { fsstack_copy_attr_times(dentry->d_inode, sdcardfs_lower_inode(dentry->d_inode)); - /* get drived permission */ - mutex_lock(&dentry->d_inode->i_mutex); + /* get derived permission */ get_derived_permission(parent, dentry); fix_derived_permission(dentry->d_inode); - mutex_unlock(&dentry->d_inode->i_mutex); } /* update parent directory's atime */ fsstack_copy_attr_atime(parent->d_inode, From 1fb590f2d5ada66a89b82c7657b3002892c966b6 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 10 May 2016 13:42:43 -0700 Subject: [PATCH 0615/3220] ANDROID: sdcardfs: Switch package list to RCU Switched the package id hashmap to use RCU. 
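The resulting discipline, roughly: lock-free readers, writers serialized on a mutex, and frees deferred past an RCU grace period. A simplified sketch, not the exact patch code:

    struct hashtable_entry *cur;
    appid_t ret = 0;

    /* reader: no lock taken, safe against concurrent removal */
    rcu_read_lock();
    hash_for_each_possible_rcu(package_to_appid, cur, hlist, hash)
            if (!strcasecmp(key, cur->key))
                    ret = atomic_read(&cur->value);
    rcu_read_unlock();

    /* writer (under sdcardfs_super_list_lock): unlink, wait, then free */
    hash_del_rcu(&cur->hlist);
    synchronize_rcu();      /* no reader can still hold a reference to cur */
    kfree(cur);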
Change-Id: I9fdcab279009005bf28536247d11e13babab0b93 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 3 +- fs/sdcardfs/packagelist.c | 212 +++++++++++++++++-------------------- fs/sdcardfs/sdcardfs.h | 2 +- 3 files changed, 99 insertions(+), 118 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index bfe402b8cf32..2a75ad873a7c 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -47,7 +47,6 @@ void setup_derived_state(struct inode *inode, perm_t perm, /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry) { - struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct sdcardfs_inode_info *info = SDCARDFS_I(dentry->d_inode); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); appid_t appid; @@ -96,7 +95,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st case PERM_ANDROID_DATA: case PERM_ANDROID_OBB: case PERM_ANDROID_MEDIA: - appid = get_appid(sbi->pkgl_id, newdentry->d_name.name); + appid = get_appid(newdentry->d_name.name); if (appid != 0) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 9c3340528eee..f5a49c513568 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -29,26 +29,13 @@ #include -#define STRING_BUF_SIZE (512) - struct hashtable_entry { struct hlist_node hlist; - void *key; - unsigned int value; + const char *key; + atomic_t value; }; -struct sb_list { - struct super_block *sb; - struct list_head list; -}; - -struct packagelist_data { - DECLARE_HASHTABLE(package_to_appid,8); - struct mutex hashtable_lock; - -}; - -static struct packagelist_data *pkgl_data_all; +static DEFINE_HASHTABLE(package_to_appid, 8); static struct kmem_cache *hashtable_entry_cachep; @@ -64,22 +51,21 @@ static unsigned int str_hash(const char *key) { return h; } -appid_t get_appid(void *pkgl_id, const char *app_name) +appid_t get_appid(const char *app_name) { - struct packagelist_data *pkgl_dat = pkgl_data_all; struct hashtable_entry *hash_cur; unsigned int hash = str_hash(app_name); appid_t ret_id; - mutex_lock(&pkgl_dat->hashtable_lock); - hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { + rcu_read_lock(); + hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(app_name, hash_cur->key)) { - ret_id = (appid_t)hash_cur->value; - mutex_unlock(&pkgl_dat->hashtable_lock); + ret_id = atomic_read(&hash_cur->value); + rcu_read_unlock(); return ret_id; } } - mutex_unlock(&pkgl_dat->hashtable_lock); + rcu_read_unlock(); return 0; } @@ -120,116 +106,118 @@ int open_flags_to_access_mode(int open_flags) { } } -static int insert_str_to_int_lock(struct packagelist_data *pkgl_dat, char *key, - unsigned int value) +static struct hashtable_entry *alloc_packagelist_entry(const char *key, + appid_t value) +{ + struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep, + GFP_KERNEL); + if (!ret) + return NULL; + + ret->key = kstrdup(key, GFP_KERNEL); + if (!ret->key) { + kmem_cache_free(hashtable_entry_cachep, ret); + return NULL; + } + + atomic_set(&ret->value, value); + return ret; +} + +static int insert_packagelist_entry_locked(const char *key, appid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; unsigned int hash = str_hash(key); - 
hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { + hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(key, hash_cur->key)) { - hash_cur->value = value; + atomic_set(&hash_cur->value, value); return 0; } } - new_entry = kmem_cache_alloc(hashtable_entry_cachep, GFP_KERNEL); + new_entry = alloc_packagelist_entry(key, value); if (!new_entry) return -ENOMEM; - new_entry->key = kstrdup(key, GFP_KERNEL); - new_entry->value = value; - hash_add(pkgl_dat->package_to_appid, &new_entry->hlist, hash); + hash_add_rcu(package_to_appid, &new_entry->hlist, hash); return 0; } static void fixup_perms(struct super_block *sb) { if (sb && sb->s_magic == SDCARDFS_SUPER_MAGIC) { - mutex_lock(&sb->s_root->d_inode->i_mutex); get_derive_permissions_recursive(sb->s_root); - mutex_unlock(&sb->s_root->d_inode->i_mutex); } } -static int insert_str_to_int(struct packagelist_data *pkgl_dat, char *key, - unsigned int value) { - int ret; - struct sdcardfs_sb_info *sbinfo; - mutex_lock(&sdcardfs_super_list_lock); - mutex_lock(&pkgl_dat->hashtable_lock); - ret = insert_str_to_int_lock(pkgl_dat, key, value); - mutex_unlock(&pkgl_dat->hashtable_lock); - - list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { - if (sbinfo) { - fixup_perms(sbinfo->sb); - } - } - mutex_unlock(&sdcardfs_super_list_lock); - return ret; -} - -static void remove_str_to_int_lock(struct hashtable_entry *h_entry) { - kfree(h_entry->key); - hash_del(&h_entry->hlist); - kmem_cache_free(hashtable_entry_cachep, h_entry); -} - -static void remove_str_to_int(struct packagelist_data *pkgl_dat, const char *key) +static void fixup_all_perms(void) { struct sdcardfs_sb_info *sbinfo; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) + if (sbinfo) + fixup_perms(sbinfo->sb); +} + +static int insert_packagelist_entry(const char *key, appid_t value) +{ + int err; + + mutex_lock(&sdcardfs_super_list_lock); + err = insert_packagelist_entry_locked(key, value); + if (!err) + fixup_all_perms(); + mutex_unlock(&sdcardfs_super_list_lock); + + return err; +} + +static void free_packagelist_entry(struct hashtable_entry *entry) +{ + kfree(entry->key); + hash_del_rcu(&entry->hlist); + kmem_cache_free(hashtable_entry_cachep, entry); +} + +static void remove_packagelist_entry_locked(const char *key) +{ struct hashtable_entry *hash_cur; unsigned int hash = str_hash(key); - mutex_lock(&sdcardfs_super_list_lock); - mutex_lock(&pkgl_dat->hashtable_lock); - hash_for_each_possible(pkgl_dat->package_to_appid, hash_cur, hlist, hash) { + + hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(key, hash_cur->key)) { - remove_str_to_int_lock(hash_cur); - break; - } - } - mutex_unlock(&pkgl_dat->hashtable_lock); - list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { - if (sbinfo) { - fixup_perms(sbinfo->sb); + hash_del_rcu(&hash_cur->hlist); + synchronize_rcu(); + free_packagelist_entry(hash_cur); + return; } } +} + +static void remove_packagelist_entry(const char *key) +{ + mutex_lock(&sdcardfs_super_list_lock); + remove_packagelist_entry_locked(key); + fixup_all_perms(); mutex_unlock(&sdcardfs_super_list_lock); return; } -static void remove_all_hashentrys(struct packagelist_data *pkgl_dat) +static void packagelist_destroy(void) { struct hashtable_entry *hash_cur; struct hlist_node *h_t; + HLIST_HEAD(free_list); int i; - mutex_lock(&pkgl_dat->hashtable_lock); - hash_for_each_safe(pkgl_dat->package_to_appid, i, h_t, hash_cur, hlist) - remove_str_to_int_lock(hash_cur); - 
mutex_unlock(&pkgl_dat->hashtable_lock); - hash_init(pkgl_dat->package_to_appid); -} + mutex_lock(&sdcardfs_super_list_lock); + hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { + hash_del_rcu(&hash_cur->hlist); + hlist_add_head(&hash_cur->hlist, &free_list); -static struct packagelist_data * packagelist_create(void) -{ - struct packagelist_data *pkgl_dat; - - pkgl_dat = kmalloc(sizeof(*pkgl_dat), GFP_KERNEL | __GFP_ZERO); - if (!pkgl_dat) { - printk(KERN_ERR "sdcardfs: Failed to create hash\n"); - return ERR_PTR(-ENOMEM); } - - mutex_init(&pkgl_dat->hashtable_lock); - hash_init(pkgl_dat->package_to_appid); - - return pkgl_dat; -} - -static void packagelist_destroy(struct packagelist_data *pkgl_dat) -{ - remove_all_hashentrys(pkgl_dat); + synchronize_rcu(); + hlist_for_each_entry_safe(hash_cur, h_t, &free_list, hlist) + free_packagelist_entry(hash_cur); + mutex_unlock(&sdcardfs_super_list_lock); printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); - kfree(pkgl_dat); } struct package_appid { @@ -245,26 +233,21 @@ static inline struct package_appid *to_package_appid(struct config_item *item) static ssize_t package_appid_attr_show(struct config_item *item, char *page) { - ssize_t count; - count = sprintf(page, "%d\n", get_appid(pkgl_data_all, item->ci_name)); - return count; + return scnprintf(page, PAGE_SIZE, "%u\n", get_appid(item->ci_name)); } static ssize_t package_appid_attr_store(struct config_item *item, const char *page, size_t count) { struct package_appid *package_appid = to_package_appid(item); - unsigned long tmp; - char *p = (char *) page; + unsigned int tmp; int ret; - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtouint(page, 10, &tmp); + if (ret) + return ret; - if (tmp > INT_MAX) - return -ERANGE; - ret = insert_str_to_int(pkgl_data_all, item->ci_name, (unsigned int)tmp); + ret = insert_packagelist_entry(item->ci_name, tmp); package_appid->add_pid = tmp; if (ret) return ret; @@ -289,7 +272,7 @@ static void package_appid_release(struct config_item *item) { printk(KERN_INFO "sdcardfs: removing %s\n", item->ci_dentry->d_name.name); /* item->ci_name is freed already, so we rely on the dentry */ - remove_str_to_int(pkgl_data_all, item->ci_dentry->d_name.name); + remove_packagelist_entry(item->ci_dentry->d_name.name); kfree(to_package_appid(item)); } @@ -333,21 +316,21 @@ static ssize_t packages_attr_show(struct config_item *item, char *page) { struct hashtable_entry *hash_cur; - struct hlist_node *h_t; int i; int count = 0, written = 0; - char errormsg[] = "\n"; + const char errormsg[] = "\n"; - mutex_lock(&pkgl_data_all->hashtable_lock); - hash_for_each_safe(pkgl_data_all->package_to_appid, i, h_t, hash_cur, hlist) { - written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", (char *)hash_cur->key, hash_cur->value); + rcu_read_lock(); + hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { + written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", + (const char *)hash_cur->key, atomic_read(&hash_cur->value)); if (count + written == PAGE_SIZE - sizeof(errormsg)) { count += scnprintf(page + count, PAGE_SIZE - count, errormsg); break; } count += written; } - mutex_unlock(&pkgl_data_all->hashtable_lock); + rcu_read_unlock(); return count; } @@ -430,7 +413,6 @@ int packagelist_init(void) return -ENOMEM; } - pkgl_data_all = packagelist_create(); configfs_sdcardfs_init(); return 0; } @@ -438,7 +420,7 @@ int packagelist_init(void) void packagelist_exit(void) { 
configfs_sdcardfs_exit(); - packagelist_destroy(pkgl_data_all); + packagelist_destroy(); if (hashtable_entry_cachep) kmem_cache_destroy(hashtable_entry_cachep); } diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index f111f898b630..75284f339ae0 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -396,7 +396,7 @@ extern struct mutex sdcardfs_super_list_lock; extern struct list_head sdcardfs_super_list; /* for packagelist.c */ -extern appid_t get_appid(void *pkgl_id, const char *app_name); +extern appid_t get_appid(const char *app_name); extern int check_caller_access_to_name(struct inode *parent_node, const char* name); extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); From 640e5265032d2e36c8933f09657f34946fbf8259 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 18 May 2016 16:57:10 -0700 Subject: [PATCH 0616/3220] ANDROID: sdcardfs: Added top to sdcardfs_inode_info Adding packages to the package list and moving files takes a large amount of locks, and is currently a heavy operation. This adds a 'top' field to the inode_info, which points to the inode for the top most directory whose owner you would like to match. On permission checks and get_attr, we look up the owner based on the information at top. When we change a package mapping, we need only modify the information in the corresponding top inode_info's. When renaming, we must ensure top is set correctly in all children. This happens when an app specific folder gets moved outside of the folder for that app. Change-Id: Ib749c60b568e9a45a46f8ceed985c1338246ec6c Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 73 ++++++++++++++++++++++++++++++++++---- fs/sdcardfs/inode.c | 45 +++++++++++++++++++---- fs/sdcardfs/main.c | 4 +-- fs/sdcardfs/packagelist.c | 12 +++---- fs/sdcardfs/sdcardfs.h | 40 ++++++++++++++++++--- fs/sdcardfs/super.c | 1 + 6 files changed, 149 insertions(+), 26 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 2a75ad873a7c..89daf69efbaa 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -30,11 +30,12 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) ci->userid = pi->userid; ci->d_uid = pi->d_uid; ci->under_android = pi->under_android; + set_top(ci, pi->top); } /* helper function for derived state */ -void setup_derived_state(struct inode *inode, perm_t perm, - userid_t userid, uid_t uid, bool under_android) +void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, + uid_t uid, bool under_android, struct inode *top) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); @@ -42,6 +43,7 @@ void setup_derived_state(struct inode *inode, perm_t perm, info->userid = userid; info->d_uid = uid; info->under_android = under_android; + set_top(info, top); } /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ @@ -70,6 +72,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; info->userid = simple_strtoul(newdentry->d_name.name, NULL, 10); + set_top(info, &info->vfs_inode); break; case PERM_ROOT: /* Assume masked off by default. 
*/ @@ -77,19 +80,23 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID; info->under_android = true; + set_top(info, &info->vfs_inode); } break; case PERM_ANDROID: if (!strcasecmp(newdentry->d_name.name, "data")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_DATA; + set_top(info, &info->vfs_inode); } else if (!strcasecmp(newdentry->d_name.name, "obb")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_OBB; + set_top(info, &info->vfs_inode); /* Single OBB directory is always shared */ } else if (!strcasecmp(newdentry->d_name.name, "media")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_MEDIA; + set_top(info, &info->vfs_inode); } break; case PERM_ANDROID_DATA: @@ -99,6 +106,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st if (appid != 0) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } + set_top(info, &info->vfs_inode); break; } } @@ -108,14 +116,65 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) get_derived_permission_new(parent, dentry, dentry); } -void get_derive_permissions_recursive(struct dentry *parent) { +static int descendant_may_need_fixup(perm_t perm) { + if (perm == PERM_PRE_ROOT || perm == PERM_ROOT || perm == PERM_ANDROID) + return 1; + return 0; +} + +static int needs_fixup(perm_t perm) { + if (perm == PERM_ANDROID_DATA || perm == PERM_ANDROID_OBB + || perm == PERM_ANDROID_MEDIA) + return 1; + return 0; +} + +void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) { + struct dentry *child; + struct sdcardfs_inode_info *info; + if (!dget(dentry)) + return; + if (!dentry->d_inode) { + dput(dentry); + return; + } + info = SDCARDFS_I(d_inode(dentry)); + + if (needs_fixup(info->perm)) { + mutex_lock(&d_inode(dentry)->i_mutex); + child = lookup_one_len(name, dentry, len); + mutex_unlock(&d_inode(dentry)->i_mutex); + if (!IS_ERR(child)) { + if (child->d_inode) { + get_derived_permission(dentry, child); + fix_derived_permission(d_inode(child)); + } + dput(child); + } + } else if (descendant_may_need_fixup(info->perm)) { + mutex_lock(&d_inode(dentry)->i_mutex); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + fixup_perms_recursive(child, name, len); + } + mutex_unlock(&d_inode(dentry)->i_mutex); + } + dput(dentry); +} + +void fixup_top_recursive(struct dentry *parent) { struct dentry *dentry; + struct sdcardfs_inode_info *info; + if (!d_inode(parent)) + return; + info = SDCARDFS_I(d_inode(parent)); spin_lock(&parent->d_lock); list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (dentry->d_inode) { - get_derived_permission(parent, dentry); - fix_derived_permission(dentry->d_inode); - get_derive_permissions_recursive(dentry); + if (d_inode(dentry)) { + if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { + get_derived_permission(parent, dentry); + fix_derived_permission(d_inode(dentry)); + fixup_top_recursive(dentry); + } } } spin_unlock(&parent->d_lock); diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 1a23c0cc8f58..67bcee2c379a 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -515,7 +515,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, * we pass along new_dentry for the name.*/ get_derived_permission_new(new_dentry->d_parent, old_dentry, 
new_dentry); fix_derived_permission(d_inode(old_dentry)); - get_derive_permissions_recursive(old_dentry); + fixup_top_recursive(old_dentry); out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); @@ -587,6 +587,16 @@ static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie) static int sdcardfs_permission(struct inode *inode, int mask) { int err; + struct inode *top = grab_top(SDCARDFS_I(inode)); + + if (!top) + return -EINVAL; + /* Ensure owner is up to date */ + if (!uid_eq(inode->i_uid, top->i_uid)) { + SDCARDFS_I(inode)->d_uid = SDCARDFS_I(top)->d_uid; + fix_derived_permission(inode); + } + release_top(SDCARDFS_I(inode)); /* * Permission check on sdcardfs inode. @@ -725,6 +735,30 @@ out_err: return err; } +static int sdcardfs_fillattr(struct inode *inode, struct kstat *stat) +{ + struct sdcardfs_inode_info *info = SDCARDFS_I(inode); + struct inode *top = grab_top(info); + if (!top) + return -EINVAL; + + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = (inode->i_mode & S_IFMT) | get_mode(SDCARDFS_I(top)); + stat->nlink = inode->i_nlink; + stat->uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); + stat->gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(top))); + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = (1 << inode->i_blkbits); + stat->blocks = inode->i_blocks; + release_top(info); + return 0; +} + static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -733,6 +767,7 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *lower_inode; struct path lower_path; struct dentry *parent; + int err; parent = dget_parent(dentry); if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { @@ -750,14 +785,12 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, lower_dentry = lower_path.dentry; lower_inode = sdcardfs_lower_inode(inode); - sdcardfs_copy_and_fix_attrs(inode, lower_inode); fsstack_copy_inode_size(inode, lower_inode); - - generic_fillattr(inode, stat); + err = sdcardfs_fillattr(inode, stat); sdcardfs_put_lower_path(dentry, &lower_path); - return 0; + return err; } const struct inode_operations sdcardfs_symlink_iops = { @@ -775,9 +808,7 @@ const struct inode_operations sdcardfs_symlink_iops = { const struct inode_operations sdcardfs_dir_iops = { .create = sdcardfs_create, .lookup = sdcardfs_lookup, -#if 0 .permission = sdcardfs_permission, -#endif .unlink = sdcardfs_unlink, .mkdir = sdcardfs_mkdir, .rmdir = sdcardfs_rmdir, diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index a6522286d731..6d526bf3d956 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -268,13 +268,13 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); mutex_lock(&sdcardfs_super_list_lock); if(sb_info->options.multiuser) { - setup_derived_state(sb->s_root->d_inode, PERM_PRE_ROOT, sb_info->options.fs_user_id, AID_ROOT, false); + setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); /*err = prepare_dir(sb_info->obbpath_s, sb_info->options.fs_low_uid, sb_info->options.fs_low_gid, 00755);*/ } else { - setup_derived_state(sb->s_root->d_inode, PERM_ROOT, 
sb_info->options.fs_low_uid, AID_ROOT, false); + setup_derived_state(sb->s_root->d_inode, PERM_ROOT, sb_info->options.fs_low_uid, AID_ROOT, false, sb->s_root->d_inode); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fix_derived_permission(sb->s_root->d_inode); diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index f5a49c513568..03776fa5f26c 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -143,18 +143,18 @@ static int insert_packagelist_entry_locked(const char *key, appid_t value) return 0; } -static void fixup_perms(struct super_block *sb) { +static void fixup_perms(struct super_block *sb, const char *key) { if (sb && sb->s_magic == SDCARDFS_SUPER_MAGIC) { - get_derive_permissions_recursive(sb->s_root); + fixup_perms_recursive(sb->s_root, key, strlen(key)); } } -static void fixup_all_perms(void) +static void fixup_all_perms(const char *key) { struct sdcardfs_sb_info *sbinfo; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) if (sbinfo) - fixup_perms(sbinfo->sb); + fixup_perms(sbinfo->sb, key); } static int insert_packagelist_entry(const char *key, appid_t value) @@ -164,7 +164,7 @@ static int insert_packagelist_entry(const char *key, appid_t value) mutex_lock(&sdcardfs_super_list_lock); err = insert_packagelist_entry_locked(key, value); if (!err) - fixup_all_perms(); + fixup_all_perms(key); mutex_unlock(&sdcardfs_super_list_lock); return err; @@ -196,7 +196,7 @@ static void remove_packagelist_entry(const char *key) { mutex_lock(&sdcardfs_super_list_lock); remove_packagelist_entry_locked(key); - fixup_all_perms(); + fixup_all_perms(key); mutex_unlock(&sdcardfs_super_list_lock); return; } diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 75284f339ae0..cfda98d257b6 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -169,6 +169,8 @@ struct sdcardfs_inode_info { userid_t userid; uid_t d_uid; bool under_android; + /* top folder for ownership */ + struct inode *top; struct inode vfs_inode; }; @@ -321,6 +323,35 @@ static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \ SDCARDFS_DENT_FUNC(lower_path) SDCARDFS_DENT_FUNC(orig_path) +/* grab a refererence if we aren't linking to ourself */ +static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) +{ + struct inode *old_top = NULL; + BUG_ON(IS_ERR_OR_NULL(top)); + if (info->top && info->top != &info->vfs_inode) { + old_top = info->top; + } + if (top != &info->vfs_inode) + igrab(top); + info->top = top; + iput(old_top); +} + +static inline struct inode *grab_top(struct sdcardfs_inode_info *info) +{ + struct inode *top = info->top; + if (top) { + return igrab(top); + } else { + return NULL; + } +} + +static inline void release_top(struct sdcardfs_inode_info *info) +{ + iput(info->top); +} + static inline int get_gid(struct sdcardfs_inode_info *info) { struct sdcardfs_sb_info *sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); if (sb_info->options.gid == AID_SDCARD_RW) { @@ -337,7 +368,7 @@ static inline int get_gid(struct sdcardfs_inode_info *info) { static inline int get_mode(struct sdcardfs_inode_info *info) { int owner_mode; int filtered_mode; - struct sdcardfs_sb_info *sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); + struct sdcardfs_sb_info * sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); int visible_mode = 0775 & ~sb_info->options.mask; if (info->perm == PERM_PRE_ROOT) { @@ -403,11 +434,12 @@ extern int packagelist_init(void); extern void packagelist_exit(void); /* for derived_perm.c */ -extern void 
setup_derived_state(struct inode *inode, perm_t perm, - userid_t userid, uid_t uid, bool under_android); +extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, + uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry); -extern void get_derive_permissions_recursive(struct dentry *parent); +extern void fixup_top_recursive(struct dentry *parent); +extern void fixup_perms_recursive(struct dentry *dentry, const char *name, size_t len); extern void update_derived_permission_lock(struct dentry *dentry); extern int need_graft_path(struct dentry *dentry); diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 1d6490128c99..0a465395aab7 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -126,6 +126,7 @@ static void sdcardfs_evict_inode(struct inode *inode) */ lower_inode = sdcardfs_lower_inode(inode); sdcardfs_set_lower_inode(inode, NULL); + set_top(SDCARDFS_I(inode), inode); iput(lower_inode); } From cae2e7f25c48be832ef5c7206346074db67599d4 Mon Sep 17 00:00:00 2001 From: alvin_liang Date: Mon, 19 Sep 2016 16:59:12 +0800 Subject: [PATCH 0617/3220] ANDROID: sdcardfs: fix external storage exporting incorrect uid Symptom: App cannot write into per-app folder Root Cause: sdcardfs exports incorrect uid Solution: fix uid Project: All Note: Test done by RD: passed Change-Id: Iff64f6f40ba4c679f07f4426d3db6e6d0db7e3ca --- fs/sdcardfs/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 6d526bf3d956..2decea3d1e3e 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -274,7 +274,7 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, sb_info->options.fs_low_uid, sb_info->options.fs_low_gid, 00755);*/ } else { - setup_derived_state(sb->s_root->d_inode, PERM_ROOT, sb_info->options.fs_low_uid, AID_ROOT, false, sb->s_root->d_inode); + setup_derived_state(d_inode(sb->s_root), PERM_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fix_derived_permission(sb->s_root->d_inode); From f32ddec923dd91d3c3c3cbb5fed48b4d36aac99c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 26 Sep 2016 14:48:22 -0700 Subject: [PATCH 0618/3220] ANDROID: sdcardfs: Move directory unlock before touch This removes a deadlock under low memory conditions. filp_open can call lookup_slow, which will attempt to lock the parent. sdcardfs_mkdir was still holding the lower parent's i_mutex while creating the .nomedia marker through filp_open, so that lookup could block forever on its own caller's lock; the directory is now unlocked before the touch.
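In sketch form, the ordering change looks like this (a condensed sketch of the mkdir path changed in the diff below, not the literal code; touch_nomedia() is a stand-in for the filp_open-based helper, and error handling is elided):

	/* before: .nomedia is created while the lower parent's i_mutex is
	 * held; if memory pressure has evicted a dentry on that path, the
	 * path walk inside filp_open() falls back to lookup_slow(), which
	 * tries to take the very i_mutex this function still holds */
	lower_parent_dentry = lock_parent(lower_dentry);
	err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode);
	touch_nomedia();
	unlock_dir(lower_parent_dentry);

	/* after: every path out of the function drops the lock first, so
	 * the lookup done on behalf of filp_open() can take it itself */
	lower_parent_dentry = lock_parent(lower_dentry);
	err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode);
	unlock_dir(lower_parent_dentry);
	touch_nomedia();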
Change-Id: I940643d0793f5051d1e79a56f4da2fa8ca3d8ff7 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 67bcee2c379a..3c353c95ef3e 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -296,14 +296,17 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode copied_fs = copy_fs_struct(current->fs); if (!copied_fs) { err = -ENOMEM; + unlock_dir(lower_parent_dentry); goto out_unlock; } current->fs = copied_fs; current->fs->umask = 0; err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode); - if (err) + if (err) { + unlock_dir(lower_parent_dentry); goto out; + } /* if it is a local obb dentry, setup it with the base obbpath */ if(need_graft_path(dentry)) { @@ -325,14 +328,18 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pi->userid); - if (err) + if (err) { + unlock_dir(lower_parent_dentry); goto out; + } fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); /* update number of links on parent directory */ set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink); + unlock_dir(lower_parent_dentry); + if ((!sbi->options.multiuser) && (!strcasecmp(dentry->d_name.name, "obb")) && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) make_nomedia_in_obb = 1; @@ -352,7 +359,6 @@ out: current->fs = saved_fs; free_fs_struct(copied_fs); out_unlock: - unlock_dir(lower_parent_dentry); sdcardfs_put_lower_path(dentry, &lower_path); out_revert: REVERT_CRED(saved_cred); From 6b42d02561d335017cd6066f506514f32962fa2d Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 15:29:51 -0700 Subject: [PATCH 0619/3220] ANDROID: mnt: Add filesystem private data to mount points This starts to add private data associated directly to mount points. The intent is to give filesystems a sense of where they have come from, as a means of letting a filesystem take different actions based on this information. 
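As an illustrative sketch (examplefs and its option struct are invented here, not part of the patch), a filesystem opts in simply by providing alloc_mnt_data; the allocation is then owned by the vfsmount, freed in free_vfsmnt(), and kept coherent across clones and remount propagation by the clone_mnt_data/copy_mnt_data super operations added below:

	struct examplefs_mnt_opts {
		gid_t gid;
		unsigned int mask;
	};

	/* called from vfs_kern_mount() before mount_fs(); the result lands
	 * in mnt->mnt.data and is kfree()d by free_vfsmnt() */
	static void *examplefs_alloc_mnt_data(void)
	{
		return kzalloc(sizeof(struct examplefs_mnt_opts), GFP_KERNEL);
	}

	static struct file_system_type examplefs_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "examplefs",
		.mount		= examplefs_mount,	/* assumed defined elsewhere */
		.alloc_mnt_data	= examplefs_alloc_mnt_data,
		.kill_sb	= kill_anon_super,
	};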
Change-Id: Ie769d7b3bb2f5972afe05c1bf16cf88c91647ab2 Signed-off-by: Daniel Rosenberg --- fs/namespace.c | 28 +++++++++++++++++++++++++++- fs/pnode.c | 14 ++++++++++++++ fs/pnode.h | 1 + include/linux/fs.h | 3 +++ include/linux/mount.h | 1 + 5 files changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 0570729c87fd..c38f318eb9df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -577,6 +577,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { + kfree(mnt->mnt.data); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -942,11 +943,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (!mnt) return ERR_PTR(-ENOMEM); + mnt->mnt.data = NULL; + if (type->alloc_mnt_data) { + mnt->mnt.data = type->alloc_mnt_data(); + if (!mnt->mnt.data) { + mnt_free_id(mnt); + free_vfsmnt(mnt); + return ERR_PTR(-ENOMEM); + } + } if (flags & MS_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { + kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); @@ -974,6 +985,14 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); + if (sb->s_op->clone_mnt_data) { + mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data); + if (!mnt->mnt.data) { + err = -ENOMEM; + goto out_free; + } + } + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) mnt->mnt_group_id = 0; /* not a peer of original */ else @@ -1042,6 +1061,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: + kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); @@ -2207,8 +2227,14 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = change_mount_flags(path->mnt, flags); else if (!capable(CAP_SYS_ADMIN)) err = -EPERM; - else + else { err = do_remount_sb(sb, flags, data, 0); + namespace_lock(); + lock_mount_hash(); + propagate_remount(mnt); + unlock_mount_hash(); + namespace_unlock(); + } if (!err) { lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; diff --git a/fs/pnode.c b/fs/pnode.c index 6367e1e435c6..4e2d78ec053a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -450,3 +450,17 @@ int propagate_umount(struct list_head *list) __propagate_umount(mnt); return 0; } + +int propagate_remount(struct mount *mnt) { + struct mount *m; + struct super_block *sb = mnt->mnt.mnt_sb; + int ret = 0; + + if (sb->s_op->copy_mnt_data) { + for (m = first_slave(mnt); m->mnt_slave.next != &mnt->mnt_slave_list; m = next_slave(m)) { + sb->s_op->copy_mnt_data(m->mnt.data, mnt->mnt.data); + } + } + + return ret; +} diff --git a/fs/pnode.h b/fs/pnode.h index 0fcdbe7ca648..4e8e94dc9e6a 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -44,6 +44,7 @@ int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); +int propagate_remount(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa514254161..3996b8ec0b84 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1697,6 +1697,8 @@ struct super_operations { int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct 
dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); + void *(*clone_mnt_data) (void *); + void (*copy_mnt_data) (void *, void *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); @@ -1931,6 +1933,7 @@ struct file_system_type { #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); + void *(*alloc_mnt_data) (void); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; diff --git a/include/linux/mount.h b/include/linux/mount.h index f822c3c11377..0e9b0977237a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -67,6 +67,7 @@ struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; + void *data; }; struct file; /* forward dec */ From 0ad2dd493c2fb614cb0b7f2355990ce035224590 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 15:58:22 -0700 Subject: [PATCH 0620/3220] ANDROID: vfs: Allow filesystems to access their private mount data Now we pass the vfsmount when mounting and remounting. This allows the filesystem to actually set up the mount specific data, although we can't quite do anything with it yet. show_options is expanded to include data that lives with the mount. To avoid changing existing filesystems, these have been added as new vfs functions. Change-Id: If80670bfad9f287abb8ac22457e1b034c9697097 Signed-off-by: Daniel Rosenberg --- fs/internal.h | 4 +++- fs/namespace.c | 4 ++-- fs/proc_namespace.c | 8 ++++++-- fs/super.c | 28 +++++++++++++++++++++++----- include/linux/fs.h | 4 ++++ 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 71859c4d0b41..6387b35a1c0d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -84,9 +84,11 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); +extern int do_remount_sb2(struct vfsmount *, struct super_block *, int, + void *, int); extern bool trylock_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, - int, const char *, void *); + int, const char *, struct vfsmount *, void *); extern struct super_block *user_get_super(dev_t); /* diff --git a/fs/namespace.c b/fs/namespace.c index c38f318eb9df..90c47548ab61 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -955,7 +955,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (flags & MS_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - root = mount_fs(type, flags, name, data); + root = mount_fs(type, flags, name, &mnt->mnt, data); if (IS_ERR(root)) { kfree(mnt->mnt.data); mnt_free_id(mnt); @@ -2228,7 +2228,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, else if (!capable(CAP_SYS_ADMIN)) err = -EPERM; else { - err = do_remount_sb(sb, flags, data, 0); + err = do_remount_sb2(path->mnt, sb, flags, data, 0); namespace_lock(); lock_mount_hash(); propagate_remount(mnt); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8ebd9a334085..a0770fee84d2 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -118,7 +118,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) if (err) goto out; show_mnt_opts(m, mnt); - if (sb->s_op->show_options) + if (sb->s_op->show_options2) + err = sb->s_op->show_options2(mnt, m, 
mnt_path.dentry); + else if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt_path.dentry); seq_puts(m, " 0 0\n"); out: @@ -178,7 +180,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) err = show_sb_opts(m, sb); if (err) goto out; - if (sb->s_op->show_options) + if (sb->s_op->show_options2) { + err = sb->s_op->show_options2(mnt, m, mnt->mnt_root); + } else if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt->mnt_root); seq_putc(m, '\n'); out: diff --git a/fs/super.c b/fs/super.c index 1014e7cc355f..589aff731caa 100644 --- a/fs/super.c +++ b/fs/super.c @@ -702,7 +702,8 @@ rescan: } /** - * do_remount_sb - asks filesystem to change mount options. + * do_remount_sb2 - asks filesystem to change mount options. + * @mnt: mount we are looking at * @sb: superblock in question * @flags: numeric part of options * @data: the rest of options @@ -710,7 +711,7 @@ rescan: * * Alters the mount options of a mounted file system. */ -int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +int do_remount_sb2(struct vfsmount *mnt, struct super_block *sb, int flags, void *data, int force) { int retval; int remount_ro; @@ -752,7 +753,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } } - if (sb->s_op->remount_fs) { + if (mnt && sb->s_op->remount_fs2) { + retval = sb->s_op->remount_fs2(mnt, sb, &flags, data); + if (retval) { + if (!force) + goto cancel_readonly; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } + } else if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); if (retval) { if (!force) @@ -784,6 +794,11 @@ cancel_readonly: return retval; } +int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +{ + return do_remount_sb2(NULL, sb, flags, data, force); +} + static void do_emergency_remount(struct work_struct *work) { struct super_block *sb, *p = NULL; @@ -1103,7 +1118,7 @@ struct dentry *mount_single(struct file_system_type *fs_type, EXPORT_SYMBOL(mount_single); struct dentry * -mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +mount_fs(struct file_system_type *type, int flags, const char *name, struct vfsmount *mnt, void *data) { struct dentry *root; struct super_block *sb; @@ -1120,7 +1135,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) goto out_free_secdata; } - root = type->mount(type, flags, name, data); + if (type->mount2) + root = type->mount2(mnt, type, flags, name, data); + else + root = type->mount(type, flags, name, data); if (IS_ERR(root)) { error = PTR_ERR(root); goto out_free_secdata; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3996b8ec0b84..31382e732e9e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1697,11 +1697,13 @@ struct super_operations { int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); + int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *); void *(*clone_mnt_data) (void *); void (*copy_mnt_data) (void *, void *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); + int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct 
seq_file *, struct dentry *); @@ -1933,6 +1935,8 @@ struct file_system_type { #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); + struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int, + const char *, void *); void *(*alloc_mnt_data) (void); void (*kill_sb) (struct super_block *); struct module *owner; From 21fc44e40ae3c574c288159846ab5a4762bd0e3f Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 16:27:45 -0700 Subject: [PATCH 0621/3220] ANDROID: vfs: Add permission2 for filesystems with per mount permissions This allows filesystems to use their mount private data to influence the permissions they return in permission2. It has been separated into a new call to avoid disrupting current permission users. Change-Id: I9d416e3b8b6eca84ef3e336bd2af89ddd51df6ca Signed-off-by: Daniel Rosenberg --- fs/exec.c | 2 +- fs/namei.c | 175 ++++++++++++++++++++--------- fs/notify/fanotify/fanotify_user.c | 2 +- fs/notify/inotify/inotify_user.c | 2 +- fs/open.c | 16 ++- fs/utimes.c | 2 +- include/linux/fs.h | 11 ++ include/linux/namei.h | 1 + ipc/mqueue.c | 10 +- security/inode.c | 2 +- 10 files changed, 158 insertions(+), 65 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b06623a9347f..50f062d64d3a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1123,7 +1123,7 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file_inode(file), MAY_READ) < 0) + if (inode_permission2(file->f_path.mnt, file_inode(file), MAY_READ) < 0) bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; } EXPORT_SYMBOL(would_dump); diff --git a/fs/namei.c b/fs/namei.c index 0c3974cd3ecd..82c7ec6532d5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -373,9 +373,11 @@ EXPORT_SYMBOL(generic_permission); * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct inode *inode, int mask) +static inline int do_inode_permission(struct vfsmount *mnt, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { + if (likely(mnt && inode->i_op->permission2)) + return inode->i_op->permission2(mnt, inode, mask); if (likely(inode->i_op->permission)) return inode->i_op->permission(inode, mask); @@ -399,7 +401,7 @@ static inline int do_inode_permission(struct inode *inode, int mask) * This does not check for a read-only file system. You probably want * inode_permission(). */ -int __inode_permission(struct inode *inode, int mask) +int __inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask) { int retval; @@ -411,7 +413,7 @@ int __inode_permission(struct inode *inode, int mask) return -EACCES; } - retval = do_inode_permission(inode, mask); + retval = do_inode_permission(mnt, inode, mask); if (retval) return retval; @@ -419,7 +421,14 @@ int __inode_permission(struct inode *inode, int mask) if (retval) return retval; - return security_inode_permission(inode, mask); + retval = security_inode_permission(inode, mask); + return retval; +} +EXPORT_SYMBOL(__inode_permission2); + +int __inode_permission(struct inode *inode, int mask) +{ + return __inode_permission2(NULL, inode, mask); } EXPORT_SYMBOL(__inode_permission); @@ -455,14 +464,20 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/ -int inode_permission(struct inode *inode, int mask) +int inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; - return __inode_permission(inode, mask); + return __inode_permission2(mnt, inode, mask); +} +EXPORT_SYMBOL(inode_permission2); + +int inode_permission(struct inode *inode, int mask) +{ + return inode_permission2(NULL, inode, mask); } EXPORT_SYMBOL(inode_permission); @@ -1643,13 +1658,13 @@ static int lookup_slow(struct nameidata *nd, struct path *path) static inline int may_lookup(struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); + int err = inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; if (unlazy_walk(nd, NULL, 0)) return -ECHILD; } - return inode_permission(nd->inode, MAY_EXEC); + return inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC); } static inline int handle_dots(struct nameidata *nd, int type) @@ -1998,11 +2013,12 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->depth = 0; if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; + struct vfsmount *mnt = nd->root.mnt; struct inode *inode = root->d_inode; if (*s) { if (!d_can_lookup(root)) return ERR_PTR(-ENOTDIR); - retval = inode_permission(inode, MAY_EXEC); + retval = inode_permission2(mnt, inode, MAY_EXEC); if (retval) return ERR_PTR(retval); } @@ -2273,13 +2289,14 @@ EXPORT_SYMBOL(vfs_path_lookup); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup + * @mnt: mount we are looking up on * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. */ -struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *lookup_one_len2(const char *name, struct vfsmount *mnt, struct dentry *base, int len) { struct qstr this; unsigned int c; @@ -2313,12 +2330,18 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return ERR_PTR(err); } - err = inode_permission(base->d_inode, MAY_EXEC); + err = inode_permission2(mnt, base->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len2); + +struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +{ + return lookup_one_len2(name, NULL, base, len); +} EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, @@ -2545,7 +2568,7 @@ EXPORT_SYMBOL(__check_sticky); * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). 
*/ -static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) +static int may_delete(struct vfsmount *mnt, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; @@ -2557,7 +2580,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) @@ -2588,14 +2611,14 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) * 3. We should have write and exec permissions on dir * 4. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct inode *dir, struct dentry *child) +static inline int may_create(struct vfsmount *mnt, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC); } /* @@ -2642,10 +2665,10 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } EXPORT_SYMBOL(unlock_rename); -int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl) +int vfs_create2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) { - int error = may_create(dir, dentry); + int error = may_create(mnt, dir, dentry); if (error) return error; @@ -2661,11 +2684,19 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create2); + +int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool want_excl) +{ + return vfs_create2(NULL, dir, dentry, mode, want_excl); +} EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; + struct vfsmount *mnt = path->mnt; struct inode *inode = dentry->d_inode; int error; @@ -2694,7 +2725,7 @@ static int may_open(struct path *path, int acc_mode, int flag) break; } - error = inode_permission(inode, acc_mode); + error = inode_permission2(mnt, inode, acc_mode); if (error) return error; @@ -2750,7 +2781,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) if (error) return error; - error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); + error = inode_permission2(dir->mnt, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -2948,6 +2979,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, bool got_write, int *opened) { struct dentry *dir = nd->path.dentry; + struct vfsmount *mnt = nd->path.mnt; struct inode *dir_inode = dir->d_inode; struct dentry *dentry; int error; @@ -2995,7 +3027,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, error = security_path_mknod(&nd->path, dentry, mode, 0); if (error) goto out_dput; - error = vfs_create(dir->d_inode, dentry, mode, + error = vfs_create2(mnt, dir->d_inode, dentry, mode, nd->flags & LOOKUP_EXCL); if (error) goto out_dput; @@ -3254,7 +3286,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, goto out; dir = path.dentry->d_inode; /* we want directory to be writable */ - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = 
inode_permission2(path.mnt, dir, MAY_WRITE | MAY_EXEC); if (error) goto out2; if (!dir->i_op->tmpfile) { @@ -3488,9 +3520,9 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, } EXPORT_SYMBOL(user_path_create); -int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +int vfs_mknod2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int error = may_create(dir, dentry); + int error = may_create(mnt, dir, dentry); if (error) return error; @@ -3514,6 +3546,12 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod2); + +int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +{ + return vfs_mknod2(NULL, dir, dentry, mode, dev); +} EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) @@ -3556,10 +3594,10 @@ retry: goto out; switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(path.dentry->d_inode,dentry,mode,true); + error = vfs_create2(path.mnt, path.dentry->d_inode,dentry,mode,true); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(path.dentry->d_inode,dentry,mode, + error = vfs_mknod2(path.mnt, path.dentry->d_inode,dentry,mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: @@ -3580,9 +3618,9 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d return sys_mknodat(AT_FDCWD, filename, mode, dev); } -int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +int vfs_mkdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode) { - int error = may_create(dir, dentry); + int error = may_create(mnt, dir, dentry); unsigned max_links = dir->i_sb->s_max_links; if (error) @@ -3604,6 +3642,12 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir2); + +int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return vfs_mkdir2(NULL, dir, dentry, mode); +} EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) @@ -3622,7 +3666,7 @@ retry: mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); if (!error) - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + error = vfs_mkdir2(path.mnt, path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3661,9 +3705,9 @@ void dentry_unhash(struct dentry *dentry) } EXPORT_SYMBOL(dentry_unhash); -int vfs_rmdir(struct inode *dir, struct dentry *dentry) +int vfs_rmdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry) { - int error = may_delete(dir, dentry, 1); + int error = may_delete(mnt, dir, dentry, 1); if (error) return error; @@ -3698,6 +3742,10 @@ out: d_delete(dentry); return error; } +int vfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + return vfs_rmdir2(NULL, dir, dentry); +} EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) @@ -3743,7 +3791,7 @@ retry: error = security_path_rmdir(&path, dentry); if (error) goto exit3; - error = vfs_rmdir(path.dentry->d_inode, dentry); + error = vfs_rmdir2(path.mnt, path.dentry->d_inode, dentry); exit3: dput(dentry); exit2: @@ -3782,10 +3830,10 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * be appropriate for callers that expect the underlying filesystem not * to be NFS 
exported. */ -int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) +int vfs_unlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; - int error = may_delete(dir, dentry, 0); + int error = may_delete(mnt, dir, dentry, 0); if (error) return error; @@ -3820,6 +3868,12 @@ out: return error; } +EXPORT_SYMBOL(vfs_unlink2); + +int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) +{ + return vfs_unlink2(NULL, dir, dentry, delegated_inode); +} EXPORT_SYMBOL(vfs_unlink); /* @@ -3867,7 +3921,7 @@ retry_deleg: error = security_path_unlink(&path, dentry); if (error) goto exit2; - error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode); + error = vfs_unlink2(path.mnt, path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } @@ -3917,9 +3971,9 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) return do_unlinkat(AT_FDCWD, pathname); } -int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +int vfs_symlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, const char *oldname) { - int error = may_create(dir, dentry); + int error = may_create(mnt, dir, dentry); if (error) return error; @@ -3936,6 +3990,12 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink2); + +int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +{ + return vfs_symlink2(NULL, dir, dentry, oldname); +} EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, @@ -3958,7 +4018,7 @@ retry: error = security_path_symlink(&path, dentry, from->name); if (!error) - error = vfs_symlink(path.dentry->d_inode, dentry, from->name); + error = vfs_symlink2(path.mnt, path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3993,7 +4053,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. 
*/ -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) +int vfs_link2(struct vfsmount *mnt, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -4002,7 +4062,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (!inode) return -ENOENT; - error = may_create(dir, new_dentry); + error = may_create(mnt, dir, new_dentry); if (error) return error; @@ -4045,6 +4105,12 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link2); + +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) +{ + return vfs_link2(NULL, old_dentry, dir, new_dentry, delegated_inode); +} EXPORT_SYMBOL(vfs_link); /* @@ -4100,7 +4166,7 @@ retry: error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); + error = vfs_link2(old_path.mnt, old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { @@ -4175,7 +4241,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, +int vfs_rename2(struct vfsmount *mnt, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, struct inode **delegated_inode, unsigned int flags) { @@ -4190,19 +4257,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (source == target) return 0; - error = may_delete(old_dir, old_dentry, is_dir); + error = may_delete(mnt, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(new_dir, new_dentry); + error = may_create(mnt, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(new_dir, new_dentry, is_dir); + error = may_delete(mnt, new_dir, new_dentry, is_dir); else - error = may_delete(new_dir, new_dentry, new_is_dir); + error = may_delete(mnt, new_dir, new_dentry, new_is_dir); } if (error) return error; @@ -4219,12 +4286,12 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(source, MAY_WRITE); + error = inode_permission2(mnt, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(target, MAY_WRITE); + error = inode_permission2(mnt, target, MAY_WRITE); if (error) return error; } @@ -4307,6 +4374,14 @@ out: return error; } +EXPORT_SYMBOL(vfs_rename2); + +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode, unsigned int flags) +{ + return vfs_rename2(NULL, old_dir, old_dentry, new_dir, new_dentry, delegated_inode, flags); +} EXPORT_SYMBOL(vfs_rename); SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, @@ -4420,7 +4495,7 @@ retry_deleg: &new_path, new_dentry, flags); if (error) goto exit5; - error = vfs_rename(old_path.dentry->d_inode, 
old_dentry, + error = vfs_rename2(old_path.mnt, old_path.dentry->d_inode, old_dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode, flags); exit5: @@ -4465,7 +4540,7 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna int vfs_whiteout(struct inode *dir, struct dentry *dentry) { - int error = may_create(dir, dentry); + int error = may_create(NULL, dir, dentry); if (error) return error; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..72559b94f159 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -475,7 +475,7 @@ static int fanotify_find_path(int dfd, const char __user *filename, } /* you can only watch an inode if you have read permissions on it */ - ret = inode_permission(path->dentry->d_inode, MAY_READ); + ret = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ); if (ret) path_put(path); out: diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e2893f17dde2..4c5b43d15e6e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -337,7 +337,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns if (error) return error; /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(path->dentry->d_inode, MAY_READ); + error = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ); if (error) path_put(path); return error; diff --git a/fs/open.c b/fs/open.c index b6f1e96a7c0b..1786467725b9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -68,9 +68,11 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, long vfs_truncate(struct path *path, loff_t length) { struct inode *inode; + struct vfsmount *mnt; long error; inode = path->dentry->d_inode; + mnt = path->mnt; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) @@ -82,7 +84,7 @@ long vfs_truncate(struct path *path, loff_t length) if (error) goto out; - error = inode_permission(inode, MAY_WRITE); + error = inode_permission2(mnt, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; @@ -340,6 +342,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) struct cred *override_cred; struct path path; struct inode *inode; + struct vfsmount *mnt; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; @@ -370,6 +373,7 @@ retry: goto out; inode = d_backing_inode(path.dentry); + mnt = path.mnt; if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* @@ -381,7 +385,7 @@ retry: goto out_path_release; } - res = inode_permission(inode, mode | MAY_ACCESS); + res = inode_permission2(mnt, inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -425,7 +429,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -445,6 +449,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); struct inode *inode; + struct vfsmount *mnt; int error = -EBADF; error = -EBADF; @@ -452,12 +457,13 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) goto out; inode = file_inode(f.file); + mnt = f.file->f_path.mnt; error = -ENOTDIR; if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = inode_permission(inode, MAY_EXEC | 
MAY_CHDIR); + error = inode_permission2(mnt, inode, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: @@ -476,7 +482,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; diff --git a/fs/utimes.c b/fs/utimes.c index aa138d64560a..c3c663dbf083 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -97,7 +97,7 @@ static int utimes_common(struct path *path, struct timespec *times) goto mnt_drop_write_and_out; if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); + error = inode_permission2(path->mnt, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 31382e732e9e..b5a863580120 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1503,13 +1503,21 @@ extern bool inode_owner_or_capable(const struct inode *inode); * VFS helper functions.. */ extern int vfs_create(struct inode *, struct dentry *, umode_t, bool); +extern int vfs_create2(struct vfsmount *, struct inode *, struct dentry *, umode_t, bool); extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); +extern int vfs_mkdir2(struct vfsmount *, struct inode *, struct dentry *, umode_t); extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +extern int vfs_mknod2(struct vfsmount *, struct inode *, struct dentry *, umode_t, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); +extern int vfs_symlink2(struct vfsmount *, struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **); +extern int vfs_link2(struct vfsmount *, struct dentry *, struct inode *, struct dentry *, struct inode **); extern int vfs_rmdir(struct inode *, struct dentry *); +extern int vfs_rmdir2(struct vfsmount *, struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); +extern int vfs_unlink2(struct vfsmount *, struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +extern int vfs_rename2(struct vfsmount *, struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); extern int vfs_whiteout(struct inode *, struct dentry *); /* @@ -1635,6 +1643,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*follow_link) (struct dentry *, void **); int (*permission) (struct inode *, int); + int (*permission2) (struct vfsmount *, struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); @@ -2442,7 +2451,9 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); +extern int inode_permission2(struct vfsmount *, struct inode *, int); extern int __inode_permission(struct inode *, int); +extern int __inode_permission2(struct vfsmount *, struct inode *, int); extern int generic_permission(struct inode *, int); extern int __check_sticky(struct inode *dir, struct inode *inode); diff --git a/include/linux/namei.h b/include/linux/namei.h index d53c25453aca..023359f18567 100644 --- a/include/linux/namei.h +++ 
b/include/linux/namei.h @@ -79,6 +79,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); +extern struct dentry *lookup_one_len2(const char *, struct vfsmount *mnt, struct dentry *, int); extern int follow_down_one(struct path *); extern int follow_down(struct path *); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 161a1807e6ef..25b7a678f9ef 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -747,7 +747,7 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir, } mode &= ~current_umask(); - ret = vfs_create(dir, path->dentry, mode, true); + ret = vfs_create2(path->mnt, dir, path->dentry, mode, true); path->dentry->d_fsdata = NULL; if (ret) return ERR_PTR(ret); @@ -763,7 +763,7 @@ static struct file *do_open(struct path *path, int oflag) if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return ERR_PTR(-EINVAL); acc = oflag2acc[oflag & O_ACCMODE]; - if (inode_permission(d_inode(path->dentry), acc)) + if (inode_permission2(path->mnt, d_inode(path->dentry), acc)) return ERR_PTR(-EACCES); return dentry_open(path, oflag, current_cred()); } @@ -796,7 +796,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, ro = mnt_want_write(mnt); /* we'll drop it in any case */ error = 0; mutex_lock(&d_inode(root)->i_mutex); - path.dentry = lookup_one_len(name->name, root, strlen(name->name)); + path.dentry = lookup_one_len2(name->name, mnt, root, strlen(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); goto out_putfd; @@ -867,7 +867,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) if (err) goto out_name; mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(name->name, mnt->mnt_root, + dentry = lookup_one_len2(name->name, mnt, mnt->mnt_root, strlen(name->name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -879,7 +879,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = -ENOENT; } else { ihold(inode); - err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL); + err = vfs_unlink2(mnt, d_inode(dentry->d_parent), dentry, NULL); } dput(dentry); diff --git a/security/inode.c b/security/inode.c index 16622aef9bde..0f1a041bf6cb 100644 --- a/security/inode.c +++ b/security/inode.c @@ -100,7 +100,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, dir = d_inode(parent); mutex_lock(&dir->i_mutex); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_one_len2(name, mount, parent, strlen(name)); if (IS_ERR(dentry)) goto out; From 1cbf8e31e3a7440a3ada22ba06f62d6359c27c0e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 16:33:11 -0700 Subject: [PATCH 0622/3220] ANDROID: vfs: Add setattr2 for filesystems with per mount permissions This allows filesystems to use their mount private data to influence the permissions they use in setattr2. It has been separated into a new call to avoid disrupting current setattr users.
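As a sketch of a consumer (examplefs and its per-mount option struct are placeholders, not sdcardfs code), a filesystem that stores a permission mask with each mount could implement the new hook like this; notify_change2() below prefers ->setattr2 whenever a vfsmount is available and falls back to ->setattr otherwise:

	static int examplefs_setattr2(struct vfsmount *mnt, struct dentry *dentry,
				      struct iattr *ia)
	{
		struct examplefs_mnt_opts *opts = mnt->data;

		/* refuse mode bits masked out on this particular mount; a
		 * plain ->setattr never sees the vfsmount, so it cannot tell
		 * two mounts of the same superblock apart */
		if ((ia->ia_valid & ATTR_MODE) && (ia->ia_mode & opts->mask))
			return -EPERM;

		return simple_setattr(dentry, ia);
	}

	static const struct inode_operations examplefs_iops = {
		.setattr	= examplefs_setattr,	/* assumed fallback for mnt-less callers */
		.setattr2	= examplefs_setattr2,
	};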
Change-Id: I19959038309284448f1b7f232d579674ef546385 Signed-off-by: Daniel Rosenberg --- fs/attr.c | 12 ++++++++++-- fs/coredump.c | 2 +- fs/inode.c | 6 +++--- fs/namei.c | 2 +- fs/open.c | 21 ++++++++++++++------- fs/utimes.c | 2 +- include/linux/fs.h | 4 ++++ 7 files changed, 34 insertions(+), 15 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index 6530ced19697..34926834cd49 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -187,7 +187,7 @@ EXPORT_SYMBOL(setattr_copy); * the file open for write, as there can be no conflicting delegation in * that case. */ -int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) +int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; @@ -262,7 +262,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (error) return error; - if (inode->i_op->setattr) + if (mnt && inode->i_op->setattr2) + error = inode->i_op->setattr2(mnt, dentry, attr); + else if (inode->i_op->setattr) error = inode->i_op->setattr(dentry, attr); else error = simple_setattr(dentry, attr); @@ -275,4 +277,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return error; } +EXPORT_SYMBOL(notify_change2); + +int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) +{ + return notify_change2(NULL, dentry, attr, delegated_inode); +} EXPORT_SYMBOL(notify_change); diff --git a/fs/coredump.c b/fs/coredump.c index 1777331eee76..84d251b582e9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -695,7 +695,7 @@ void do_coredump(const siginfo_t *siginfo) goto close_fail; if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; - if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + if (do_truncate2(cprm.file->f_path.mnt, cprm.file->f_path.dentry, 0, 0, cprm.file)) goto close_fail; } diff --git a/fs/inode.c b/fs/inode.c index 1be5f9003eb3..aa3cf5a1b819 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1715,7 +1715,7 @@ int dentry_needs_remove_privs(struct dentry *dentry) } EXPORT_SYMBOL(dentry_needs_remove_privs); -static int __remove_privs(struct dentry *dentry, int kill) +static int __remove_privs(struct vfsmount *mnt, struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1724,7 +1724,7 @@ static int __remove_privs(struct dentry *dentry, int kill) * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ - return notify_change(dentry, &newattrs, NULL); + return notify_change2(mnt, dentry, &newattrs, NULL); } /* @@ -1746,7 +1746,7 @@ int file_remove_privs(struct file *file) if (kill < 0) return kill; if (kill) - error = __remove_privs(dentry, kill); + error = __remove_privs(file->f_path.mnt, dentry, kill); if (!error) inode_has_no_xattr(inode); diff --git a/fs/namei.c b/fs/namei.c index 82c7ec6532d5..0230bc59f344 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2760,7 +2760,7 @@ static int handle_truncate(struct file *filp) if (!error) error = security_path_truncate(path); if (!error) { - error = do_truncate(path->dentry, 0, + error = do_truncate2(path->mnt, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } diff --git a/fs/open.c b/fs/open.c index 1786467725b9..08406e19e2cf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -34,8 +34,8 @@ #include "internal.h" -int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, - struct file *filp) +int do_truncate2(struct 
vfsmount *mnt, struct dentry *dentry, loff_t length, + unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; @@ -60,10 +60,15 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, mutex_lock(&dentry->d_inode->i_mutex); /* Note any delegations or leases have already been broken: */ - ret = notify_change(dentry, &newattrs, NULL); + ret = notify_change2(mnt, dentry, &newattrs, NULL); mutex_unlock(&dentry->d_inode->i_mutex); return ret; } +int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) +{ + return do_truncate2(NULL, dentry, length, time_attrs, filp); +} long vfs_truncate(struct path *path, loff_t length) { @@ -108,7 +113,7 @@ long vfs_truncate(struct path *path, loff_t length) if (!error) error = security_path_truncate(path); if (!error) - error = do_truncate(path->dentry, length, 0, NULL); + error = do_truncate2(mnt, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); @@ -157,6 +162,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; struct dentry *dentry; + struct vfsmount *mnt; struct fd f; int error; @@ -173,6 +179,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) small = 0; dentry = f.file->f_path.dentry; + mnt = f.file->f_path.mnt; inode = dentry->d_inode; error = -EINVAL; if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) @@ -192,7 +199,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (!error) error = security_path_truncate(&f.file->f_path); if (!error) - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); + error = do_truncate2(mnt, dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: fdput(f); @@ -522,7 +529,7 @@ retry_deleg: goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode); out_unlock: mutex_unlock(&inode->i_mutex); if (delegated_inode) { @@ -602,7 +609,7 @@ retry_deleg: mutex_lock(&inode->i_mutex); error = security_path_chown(path, uid, gid); if (!error) - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode); mutex_unlock(&inode->i_mutex); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/fs/utimes.c b/fs/utimes.c index c3c663dbf083..dfb457546c60 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -104,7 +104,7 @@ static int utimes_common(struct path *path, struct timespec *times) } retry_deleg: mutex_lock(&inode->i_mutex); - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode); mutex_unlock(&inode->i_mutex); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index b5a863580120..31bf8e1e58ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1661,6 +1661,7 @@ struct inode_operations { int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct dentry *, struct iattr *); + int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) 
(struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); @@ -2226,6 +2227,8 @@ struct filename { extern long vfs_truncate(struct path *, loff_t); extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); +extern int do_truncate2(struct vfsmount *, struct dentry *, loff_t start, + unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, @@ -2450,6 +2453,7 @@ extern void emergency_remount(void); extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); +extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int inode_permission2(struct vfsmount *, struct inode *, int); extern int __inode_permission(struct inode *, int); From f9cb61dcb00ca9a04b98a9f35ac09f3004c87138 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 16:48:45 -0700 Subject: [PATCH 0623/3220] ANDROID: sdcardfs: Use new permission2 functions Change-Id: Ic7e0fb8fdcebb31e657b079fe02ac834c4a50db9 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/inode.c | 25 +++++++++++++++++++------ fs/sdcardfs/sdcardfs.h | 4 ++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 3c353c95ef3e..dc64c9e2f5e7 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -54,6 +54,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, { int err; struct dentry *lower_dentry; + struct vfsmount *lower_dentry_mnt; struct dentry *lower_parent_dentry = NULL; struct path lower_path; const struct cred *saved_cred = NULL; @@ -73,6 +74,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_dentry_mnt = lower_path.mnt; lower_parent_dentry = lock_parent(lower_dentry); /* set last 16bytes of mode field to 0664 */ @@ -87,7 +89,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, } current->fs = copied_fs; current->fs->umask = 0; - err = vfs_create(d_inode(lower_parent_dentry), lower_dentry, mode, want_excl); + err = vfs_create2(lower_dentry_mnt, d_inode(lower_parent_dentry), lower_dentry, mode, want_excl); if (err) goto out; @@ -154,6 +156,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) { int err; struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct inode *lower_dir_inode = sdcardfs_lower_inode(dir); struct dentry *lower_dir_dentry; struct path lower_path; @@ -172,10 +175,11 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_mnt = lower_path.mnt; dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - err = vfs_unlink(lower_dir_inode, lower_dentry, NULL); + err = vfs_unlink2(lower_mnt, lower_dir_inode, lower_dentry, NULL); /* * Note: unlinking on top of NFS can cause silly-renamed files.
@@ -256,6 +260,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode int err; int make_nomedia_in_obb = 0; struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct dentry *lower_parent_dentry = NULL; struct path lower_path; struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); @@ -286,6 +291,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* the lower_dentry is negative here */ sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_mnt = lower_path.mnt; lower_parent_dentry = lock_parent(lower_dentry); /* set last 16bytes of mode field to 0775 */ @@ -301,7 +307,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } current->fs = copied_fs; current->fs->umask = 0; - err = vfs_mkdir(d_inode(lower_parent_dentry), lower_dentry, mode); + err = vfs_mkdir2(lower_mnt, d_inode(lower_parent_dentry), lower_dentry, mode); if (err) { unlock_dir(lower_parent_dentry); @@ -370,6 +376,7 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) { struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct vfsmount *lower_mnt; int err; struct path lower_path; const struct cred *saved_cred = NULL; @@ -390,9 +397,10 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) sdcardfs_get_real_lower(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_mnt = lower_path.mnt; lower_dir_dentry = lock_parent(lower_dentry); - err = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); + err = vfs_rmdir2(lower_mnt, d_inode(lower_dir_dentry), lower_dentry); if (err) goto out; @@ -456,6 +464,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dentry = NULL; struct dentry *lower_old_dir_dentry = NULL; struct dentry *lower_new_dir_dentry = NULL; + struct vfsmount *lower_mnt = NULL; struct dentry *trap = NULL; struct dentry *new_parent = NULL; struct path lower_old_path, lower_new_path; @@ -477,6 +486,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, sdcardfs_get_lower_path(new_dentry, &lower_new_path); lower_old_dentry = lower_old_path.dentry; lower_new_dentry = lower_new_path.dentry; + lower_mnt = lower_old_path.mnt; lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); @@ -492,7 +502,8 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } - err = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, + err = vfs_rename2(lower_mnt, + d_inode(lower_old_dir_dentry), lower_old_dentry, d_inode(lower_new_dir_dentry), lower_new_dentry, NULL, 0); if (err) @@ -642,6 +653,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) { int err; struct dentry *lower_dentry; + struct vfsmount *lower_mnt; struct inode *inode; struct inode *lower_inode; struct path lower_path; @@ -675,6 +687,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; + lower_mnt = lower_path.mnt; lower_inode = sdcardfs_lower_inode(inode); /* prepare our own lower struct iattr (with the lower file) */ @@ -718,7 +731,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) * tries to open(), unlink(), then ftruncate() a file. 
*/ mutex_lock(&d_inode(lower_dentry)->i_mutex); - err = notify_change(lower_dentry, &lower_ia, /* note: lower_ia */ + err = notify_change2(lower_mnt, lower_dentry, &lower_ia, /* note: lower_ia */ NULL); mutex_unlock(&d_inode(lower_dentry)->i_mutex); if (current->mm) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index cfda98d257b6..5132f1dc5a4d 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -476,7 +476,7 @@ static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t m goto out_unlock; } - err = vfs_mkdir(d_inode(parent.dentry), dent, mode); + err = vfs_mkdir2(parent.mnt, d_inode(parent.dentry), dent, mode); if (err) { if (err == -EEXIST) err = 0; @@ -487,7 +487,7 @@ static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t m attrs.ia_gid = make_kgid(&init_user_ns, gid); attrs.ia_valid = ATTR_UID | ATTR_GID; mutex_lock(&d_inode(dent)->i_mutex); - notify_change(dent, &attrs, NULL); + notify_change2(parent.mnt, dent, &attrs, NULL); mutex_unlock(&d_inode(dent)->i_mutex); out_dput: From 6b6e896b0beed019c1fe4ad1b49c327d905956b8 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 17:36:05 -0700 Subject: [PATCH 0624/3220] ANDROID: sdcardfs: Add gid and mask to private mount data Adds support for mount2, remount2, and the functions to allocate/clone/copy the private data The next patch will switch over to actually using it. Change-Id: I8a43da26021d33401f655f0b2784ead161c575e3 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/main.c | 103 ++++++++++++++++++++++++++++++++++++----- fs/sdcardfs/sdcardfs.h | 8 ++++ fs/sdcardfs/super.c | 64 ++++++++++++++++++++++--- 3 files changed, 157 insertions(+), 18 deletions(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 2decea3d1e3e..5400e7e63d27 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -49,7 +49,8 @@ static const match_table_t sdcardfs_tokens = { }; static int parse_options(struct super_block *sb, char *options, int silent, - int *debug, struct sdcardfs_mount_options *opts) + int *debug, struct sdcardfs_vfsmount_options *vfsopts, + struct sdcardfs_mount_options *opts) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -58,9 +59,11 @@ static int parse_options(struct super_block *sb, char *options, int silent, /* by default, we use AID_MEDIA_RW as uid, gid */ opts->fs_low_uid = AID_MEDIA_RW; opts->fs_low_gid = AID_MEDIA_RW; + vfsopts->mask = 0; opts->mask = 0; opts->multiuser = false; opts->fs_user_id = 0; + vfsopts->gid = 0; opts->gid = 0; /* by default, 0MB is reserved */ opts->reserved_mb = 0; @@ -95,6 +98,7 @@ static int parse_options(struct super_block *sb, char *options, int silent, if (match_int(&args[0], &option)) return 0; opts->gid = option; + vfsopts->gid = option; break; case Opt_userid: if (match_int(&args[0], &option)) @@ -105,6 +109,7 @@ static int parse_options(struct super_block *sb, char *options, int silent, if (match_int(&args[0], &option)) return 0; opts->mask = option; + vfsopts->mask = option; break; case Opt_multiuser: opts->multiuser = true; @@ -135,6 +140,65 @@ static int parse_options(struct super_block *sb, char *options, int silent, return 0; } +int parse_options_remount(struct super_block *sb, char *options, int silent, + struct sdcardfs_vfsmount_options *vfsopts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + int debug; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, sdcardfs_tokens, args); + + switch 
(token) { + case Opt_debug: + debug = 1; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + vfsopts->gid = option; + + break; + case Opt_mask: + if (match_int(&args[0], &option)) + return 0; + vfsopts->mask = option; + break; + case Opt_multiuser: + case Opt_userid: + case Opt_fsuid: + case Opt_fsgid: + case Opt_reserved_mb: + printk( KERN_WARNING "Option \"%s\" can't be changed during remount\n", p); + break; + /* unknown option */ + default: + if (!silent) { + printk( KERN_ERR "Unrecognized mount option \"%s\" " + "or missing value", p); + } + return -EINVAL; + } + } + + if (debug) { + printk( KERN_INFO "sdcardfs : options - debug:%d\n", debug); + printk( KERN_INFO "sdcardfs : options - gid:%d\n", vfsopts->gid); + printk( KERN_INFO "sdcardfs : options - mask:%d\n", vfsopts->mask); + } + + return 0; +} + #if 0 /* * our custom d_alloc_root work-alike @@ -172,14 +236,15 @@ EXPORT_SYMBOL_GPL(sdcardfs_super_list); * There is no need to lock the sdcardfs_super_info's rwsem as there is no * way anyone can have a reference to the superblock at this point in time. */ -static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, - void *raw_data, int silent) +static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, + const char *dev_name, void *raw_data, int silent) { int err = 0; int debug; struct super_block *lower_sb; struct path lower_path; struct sdcardfs_sb_info *sb_info; + struct sdcardfs_vfsmount_options *mnt_opt = mnt->data; struct inode *inode; printk(KERN_INFO "sdcardfs version 2.0\n"); @@ -212,7 +277,7 @@ static int sdcardfs_read_super(struct super_block *sb, const char *dev_name, sb_info = sb->s_fs_info; /* parse options */ - err = parse_options(sb, raw_data, silent, &debug, &sb_info->options); + err = parse_options(sb, raw_data, silent, &debug, mnt_opt, &sb_info->options); if (err) { printk(KERN_ERR "sdcardfs: invalid options\n"); goto out_freesbi; @@ -306,9 +371,9 @@ out: } /* A feature which supports mount_nodev() with options */ -static struct dentry *mount_nodev_with_options(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, const char *, void *, int)) +static struct dentry *mount_nodev_with_options(struct vfsmount *mnt, + struct file_system_type *fs_type, int flags, const char *dev_name, void *data, + int (*fill_super)(struct vfsmount *, struct super_block *, const char *, void *, int)) { int error; @@ -319,7 +384,7 @@ static struct dentry *mount_nodev_with_options(struct file_system_type *fs_type, s->s_flags = flags; - error = fill_super(s, dev_name, data, flags & MS_SILENT ? 1 : 0); + error = fill_super(mnt, s, dev_name, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); @@ -328,15 +393,27 @@ static struct dentry *mount_nodev_with_options(struct file_system_type *fs_type, return dget(s->s_root); } -struct dentry *sdcardfs_mount(struct file_system_type *fs_type, int flags, +static struct dentry *sdcardfs_mount(struct vfsmount *mnt, + struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { /* * dev_name is a lower_path_name, * raw_data is a option string. 
*/ - return mount_nodev_with_options(fs_type, flags, dev_name, - raw_data, sdcardfs_read_super); + return mount_nodev_with_options(mnt, fs_type, flags, dev_name, + raw_data, sdcardfs_read_super); +} + +static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + WARN(1, "sdcardfs does not support mount. Use mount2.\n"); + return ERR_PTR(-EINVAL); +} + +void *sdcardfs_alloc_mnt_data(void) { + return kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); } void sdcardfs_kill_sb(struct super_block *sb) { @@ -353,7 +430,9 @@ void sdcardfs_kill_sb(struct super_block *sb) { static struct file_system_type sdcardfs_fs_type = { .owner = THIS_MODULE, .name = SDCARDFS_NAME, - .mount = sdcardfs_mount, + .mount = sdcardfs_mount_wrn, + .mount2 = sdcardfs_mount, + .alloc_mnt_data = sdcardfs_alloc_mnt_data, .kill_sb = sdcardfs_kill_sb, .fs_flags = 0, }; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 5132f1dc5a4d..22ef29857022 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -193,6 +193,14 @@ struct sdcardfs_mount_options { unsigned int reserved_mb; }; +struct sdcardfs_vfsmount_options { + gid_t gid; + mode_t mask; +}; + +extern int parse_options_remount(struct super_block *sb, char *options, int silent, + struct sdcardfs_vfsmount_options *vfsopts); + /* sdcardfs super-block data in memory */ struct sdcardfs_sb_info { struct super_block *sb; diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 0a465395aab7..edda32b68dc0 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -108,6 +108,50 @@ static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options return err; } +/* + * @mnt: mount point we are remounting + * @sb: superblock we are remounting + * @flags: numeric mount options + * @options: mount options string + */ +static int sdcardfs_remount_fs2(struct vfsmount *mnt, struct super_block *sb, + int *flags, char *options) +{ + int err = 0; + + /* + * The VFS will take care of "ro" and "rw" flags among others. We + * can safely accept a few flags (RDONLY, MANDLOCK), and honor + * SILENT, but anything else left over is an error. + */ + if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT | MS_REMOUNT)) != 0) { + printk(KERN_ERR + "sdcardfs: remount flags 0x%x unsupported\n", *flags); + err = -EINVAL; + } + printk(KERN_INFO "Remount options were %s for vfsmnt %p.\n", options, mnt); + err = parse_options_remount(sb, options, *flags & ~MS_SILENT, mnt->data); + + + return err; +} + +static void* sdcardfs_clone_mnt_data(void *data) { + struct sdcardfs_vfsmount_options* opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); + struct sdcardfs_vfsmount_options* old = data; + if(!opt) return NULL; + opt->gid = old->gid; + opt->mask = old->mask; + return opt; +} + +static void sdcardfs_copy_mnt_data(void *data, void *newdata) { + struct sdcardfs_vfsmount_options* old = data; + struct sdcardfs_vfsmount_options* new = newdata; + old->gid = new->gid; + old->mask = new->mask; +} + /* * Called by iput() when the inode reference count reached zero * and the inode is not hashed anywhere. 
Used to clear anything @@ -191,19 +235,24 @@ static void sdcardfs_umount_begin(struct super_block *sb) lower_sb->s_op->umount_begin(lower_sb); } -static int sdcardfs_show_options(struct seq_file *m, struct dentry *root) +static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, struct dentry *root) { struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb); struct sdcardfs_mount_options *opts = &sbi->options; + struct sdcardfs_vfsmount_options *vfsopts = mnt->data; if (opts->fs_low_uid != 0) - seq_printf(m, ",uid=%u", opts->fs_low_uid); + seq_printf(m, ",fsuid=%u", opts->fs_low_uid); if (opts->fs_low_gid != 0) - seq_printf(m, ",gid=%u", opts->fs_low_gid); - + seq_printf(m, ",fsgid=%u", opts->fs_low_gid); + if (vfsopts->gid != 0) + seq_printf(m, ",gid=%u", vfsopts->gid); if (opts->multiuser) seq_printf(m, ",multiuser"); - + if (vfsopts->mask) + seq_printf(m, ",mask=%u", vfsopts->mask); + if (opts->fs_user_id) + seq_printf(m, ",userid=%u", opts->fs_user_id); if (opts->reserved_mb != 0) seq_printf(m, ",reserved=%uMB", opts->reserved_mb); @@ -214,9 +263,12 @@ const struct super_operations sdcardfs_sops = { .put_super = sdcardfs_put_super, .statfs = sdcardfs_statfs, .remount_fs = sdcardfs_remount_fs, + .remount_fs2 = sdcardfs_remount_fs2, + .clone_mnt_data = sdcardfs_clone_mnt_data, + .copy_mnt_data = sdcardfs_copy_mnt_data, .evict_inode = sdcardfs_evict_inode, .umount_begin = sdcardfs_umount_begin, - .show_options = sdcardfs_show_options, + .show_options2 = sdcardfs_show_options, .alloc_inode = sdcardfs_alloc_inode, .destroy_inode = sdcardfs_destroy_inode, .drop_inode = generic_delete_inode, From 4d70f73115248761f35aaa9725a22e4b957d2938 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 20:27:20 -0700 Subject: [PATCH 0625/3220] ANDROID: sdcardfs: Use per mount permissions This switches sdcardfs over to using permission2. Instead of mounting several sdcardfs instances onto the same underlaying directory, you bind mount a single mount several times, and remount with the options you want. These are stored in the private mount data, allowing you to maintain the same tree, but have different permissions for different mount points. Warning functions have been added for permission, as it should never be called, and the correct behavior is unclear. Change-Id: I841b1d70ec60cf2b866fa48edeb74a0b0f8334f5 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 20 ++++-- fs/sdcardfs/inode.c | 127 ++++++++++++++++++++++++++++++------- fs/sdcardfs/lookup.c | 4 +- fs/sdcardfs/main.c | 8 +-- fs/sdcardfs/sdcardfs.h | 44 ++++++++----- 5 files changed, 150 insertions(+), 53 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 89daf69efbaa..066edbbb6ad6 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -141,13 +141,23 @@ void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) info = SDCARDFS_I(d_inode(dentry)); if (needs_fixup(info->perm)) { + /* We need permission to fix up these values. + * Since permissions are based of of the mount, and + * we are accessing without the mount point, we create + * a fake mount with the permissions we will be using. 
+ */ + struct vfsmount fakemnt; + struct sdcardfs_vfsmount_options opts; + fakemnt.data = &opts; + opts.gid = AID_SDCARD_RW; + opts.mask = 0; mutex_lock(&d_inode(dentry)->i_mutex); - child = lookup_one_len(name, dentry, len); + child = lookup_one_len2(name, &fakemnt, dentry, len); mutex_unlock(&d_inode(dentry)->i_mutex); if (!IS_ERR(child)) { - if (child->d_inode) { + if (d_inode(child)) { get_derived_permission(dentry, child); - fix_derived_permission(d_inode(child)); + fixup_tmp_permissions(d_inode(child)); } dput(child); } @@ -172,7 +182,7 @@ void fixup_top_recursive(struct dentry *parent) { if (d_inode(dentry)) { if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { get_derived_permission(parent, dentry); - fix_derived_permission(d_inode(dentry)); + fixup_tmp_permissions(d_inode(dentry)); fixup_top_recursive(dentry); } } @@ -202,7 +212,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) dput(parent); } } - fix_derived_permission(dentry->d_inode); + fixup_tmp_permissions(d_inode(dentry)); } int need_graft_path(struct dentry *dentry) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index dc64c9e2f5e7..76a6e8ad0736 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -531,7 +531,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* At this point, not all dentry information has been moved, so * we pass along new_dentry for the name.*/ get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry); - fix_derived_permission(d_inode(old_dentry)); + fixup_tmp_permissions(d_inode(old_dentry)); fixup_top_recursive(old_dentry); out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); @@ -601,26 +601,63 @@ static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie) } #endif -static int sdcardfs_permission(struct inode *inode, int mask) +static int sdcardfs_permission_wrn(struct inode *inode, int mask) +{ + WARN(1, "sdcardfs does not support permission. Use permission2.\n"); + return -EINVAL; +} + +void copy_attrs(struct inode *dest, const struct inode *src) +{ + dest->i_mode = src->i_mode; + dest->i_uid = src->i_uid; + dest->i_gid = src->i_gid; + dest->i_rdev = src->i_rdev; + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + dest->i_blkbits = src->i_blkbits; + dest->i_flags = src->i_flags; +#ifdef CONFIG_FS_POSIX_ACL + dest->i_acl = src->i_acl; +#endif +#ifdef CONFIG_SECURITY + dest->i_security = src->i_security; +#endif +} + +static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int mask) { int err; + struct inode tmp; struct inode *top = grab_top(SDCARDFS_I(inode)); - if (!top) + if (!top) { + release_top(SDCARDFS_I(inode)); + WARN(1, "Top value was null!\n"); return -EINVAL; - /* Ensure owner is up to date */ - if (!uid_eq(inode->i_uid, top->i_uid)) { - SDCARDFS_I(inode)->d_uid = SDCARDFS_I(top)->d_uid; - fix_derived_permission(inode); } - release_top(SDCARDFS_I(inode)); /* * Permission check on sdcardfs inode. * Calling process should have AID_SDCARD_RW permission + * Since generic_permission only needs i_mode, i_uid, + * i_gid, and i_sb, we can create a fake inode to pass + * this information down in. + * + * The underlying code may attempt to take locks in some + * cases for features we're not using, but if that changes, + * locks must be dealt with to avoid undefined behavior. 
*/ - err = generic_permission(inode, mask); - + copy_attrs(&tmp, inode); + tmp.i_uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); + tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); + tmp.i_mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); + release_top(SDCARDFS_I(inode)); + tmp.i_sb = inode->i_sb; + if (IS_POSIXACL(inode)) + printk(KERN_WARNING "%s: This may be undefined behavior... \n", __func__); + err = generic_permission(&tmp, mask); /* XXX * Original sdcardfs code calls inode_permission(lower_inode,.. ) * for checking inode permission. But doing such things here seems @@ -649,7 +686,13 @@ static int sdcardfs_permission(struct inode *inode, int mask) } -static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) +static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia) +{ + WARN(1, "sdcardfs does not support setattr. User setattr2.\n"); + return -EINVAL; +} + +static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct iattr *ia) { int err; struct dentry *lower_dentry; @@ -659,17 +702,45 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) struct path lower_path; struct iattr lower_ia; struct dentry *parent; + struct inode tmp; + struct inode *top; + const struct cred *saved_cred = NULL; inode = d_inode(dentry); + top = grab_top(SDCARDFS_I(inode)); + + if (!top) { + release_top(SDCARDFS_I(inode)); + return -EINVAL; + } + + /* + * Permission check on sdcardfs inode. + * Calling process should have AID_SDCARD_RW permission + * Since generic_permission only needs i_mode, i_uid, + * i_gid, and i_sb, we can create a fake inode to pass + * this information down in. + * + * The underlying code may attempt to take locks in some + * cases for features we're not using, but if that changes, + * locks must be dealt with to avoid undefined behavior. + * + */ + copy_attrs(&tmp, inode); + tmp.i_uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); + tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); + tmp.i_mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); + tmp.i_size = i_size_read(inode); + release_top(SDCARDFS_I(inode)); + tmp.i_sb = inode->i_sb; /* * Check if user has permission to change inode. We don't check if * this user can change the lower inode: that should happen when * calling notify_change on the lower inode. */ - err = inode_change_ok(inode, ia); + err = inode_change_ok(&tmp, ia); - /* no vfs_XXX operations required, cred overriding will be skipped. 
wj*/ if (!err) { /* check the Android group ID */ parent = dget_parent(dentry); @@ -685,6 +756,9 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) if (err) goto out_err; + /* save current_cred and override it */ + OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred); + sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; lower_mnt = lower_path.mnt; @@ -708,7 +782,7 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) if (current->mm) down_write(¤t->mm->mmap_sem); if (ia->ia_valid & ATTR_SIZE) { - err = inode_newsize_ok(inode, ia->ia_size); + err = inode_newsize_ok(&tmp, ia->ia_size); if (err) { if (current->mm) up_write(¤t->mm->mmap_sem); @@ -750,11 +824,12 @@ static int sdcardfs_setattr(struct dentry *dentry, struct iattr *ia) out: sdcardfs_put_lower_path(dentry, &lower_path); + REVERT_CRED(saved_cred); out_err: return err; } -static int sdcardfs_fillattr(struct inode *inode, struct kstat *stat) +static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode, struct kstat *stat) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); struct inode *top = grab_top(info); @@ -763,10 +838,10 @@ static int sdcardfs_fillattr(struct inode *inode, struct kstat *stat) stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; - stat->mode = (inode->i_mode & S_IFMT) | get_mode(SDCARDFS_I(top)); + stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); stat->nlink = inode->i_nlink; stat->uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); - stat->gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(top))); + stat->gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; @@ -807,14 +882,14 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, sdcardfs_copy_and_fix_attrs(inode, lower_inode); fsstack_copy_inode_size(inode, lower_inode); - err = sdcardfs_fillattr(inode, stat); + err = sdcardfs_fillattr(mnt, inode, stat); sdcardfs_put_lower_path(dentry, &lower_path); return err; } const struct inode_operations sdcardfs_symlink_iops = { - .permission = sdcardfs_permission, - .setattr = sdcardfs_setattr, + .permission2 = sdcardfs_permission, + .setattr2 = sdcardfs_setattr, /* XXX Following operations are implemented, * but FUSE(sdcard) or FAT does not support them * These methods are *NOT* perfectly tested. 
@@ -827,12 +902,14 @@ const struct inode_operations sdcardfs_symlink_iops = { const struct inode_operations sdcardfs_dir_iops = { .create = sdcardfs_create, .lookup = sdcardfs_lookup, - .permission = sdcardfs_permission, + .permission = sdcardfs_permission_wrn, + .permission2 = sdcardfs_permission, .unlink = sdcardfs_unlink, .mkdir = sdcardfs_mkdir, .rmdir = sdcardfs_rmdir, .rename = sdcardfs_rename, - .setattr = sdcardfs_setattr, + .setattr = sdcardfs_setattr_wrn, + .setattr2 = sdcardfs_setattr, .getattr = sdcardfs_getattr, /* XXX Following operations are implemented, * but FUSE(sdcard) or FAT does not support them @@ -844,7 +921,9 @@ const struct inode_operations sdcardfs_dir_iops = { }; const struct inode_operations sdcardfs_main_iops = { - .permission = sdcardfs_permission, - .setattr = sdcardfs_setattr, + .permission = sdcardfs_permission_wrn, + .permission2 = sdcardfs_permission, + .setattr = sdcardfs_setattr_wrn, + .setattr2 = sdcardfs_setattr, .getattr = sdcardfs_getattr, }; diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index c74a7d1bc18e..00a711ec2733 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -244,6 +244,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, if (err == -ENOENT) { struct dentry *child; struct dentry *match = NULL; + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); spin_lock(&lower_dir_dentry->d_lock); list_for_each_entry(child, &lower_dir_dentry->d_subdirs, d_child) { if (child && d_inode(child)) { @@ -254,6 +255,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, } } spin_unlock(&lower_dir_dentry->d_lock); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (match) { err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, @@ -389,7 +391,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, sdcardfs_lower_inode(dentry->d_inode)); /* get derived permission */ get_derived_permission(parent, dentry); - fix_derived_permission(dentry->d_inode); + fixup_tmp_permissions(d_inode(dentry)); } /* update parent directory's atime */ fsstack_copy_attr_atime(parent->d_inode, diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 5400e7e63d27..eec10ccacd99 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -28,7 +28,6 @@ enum { Opt_fsgid, Opt_gid, Opt_debug, - Opt_lower_fs, Opt_mask, Opt_multiuser, // May need? 
Opt_userid, @@ -60,11 +59,9 @@ static int parse_options(struct super_block *sb, char *options, int silent, opts->fs_low_uid = AID_MEDIA_RW; opts->fs_low_gid = AID_MEDIA_RW; vfsopts->mask = 0; - opts->mask = 0; opts->multiuser = false; opts->fs_user_id = 0; vfsopts->gid = 0; - opts->gid = 0; /* by default, 0MB is reserved */ opts->reserved_mb = 0; @@ -97,7 +94,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, case Opt_gid: if (match_int(&args[0], &option)) return 0; - opts->gid = option; vfsopts->gid = option; break; case Opt_userid: @@ -108,7 +104,6 @@ static int parse_options(struct super_block *sb, char *options, int silent, case Opt_mask: if (match_int(&args[0], &option)) return 0; - opts->mask = option; vfsopts->mask = option; break; case Opt_multiuser: @@ -258,6 +253,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, printk(KERN_INFO "sdcardfs: dev_name -> %s\n", dev_name); printk(KERN_INFO "sdcardfs: options -> %s\n", (char *)raw_data); + printk(KERN_INFO "sdcardfs: mnt -> %p\n", mnt); /* parse lower path */ err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, @@ -342,7 +338,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, setup_derived_state(d_inode(sb->s_root), PERM_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } - fix_derived_permission(sb->s_root->d_inode); + fixup_tmp_permissions(d_inode(sb->s_root)); sb_info->sb = sb; list_add(&sb_info->list, &sdcardfs_super_list); mutex_unlock(&sdcardfs_super_list_lock); diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 22ef29857022..b03130329014 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -68,14 +68,20 @@ #define AID_PACKAGE_INFO 1027 -#define fix_derived_permission(x) \ + +/* + * Permissions are handled by our permission function. + * We don't want anyone who happens to look at our inode value to prematurely + * block access, so store more permissive values. These are probably never + * used. + */ +#define fixup_tmp_permissions(x) \ do { \ (x)->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(x)->d_uid); \ - (x)->i_gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(x))); \ - (x)->i_mode = ((x)->i_mode & S_IFMT) | get_mode(SDCARDFS_I(x));\ + (x)->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); \ + (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\ } while (0) - /* OVERRIDE_CRED() and REVERT_CRED() * OVERRID_CRED() * backup original task->cred @@ -187,8 +193,6 @@ struct sdcardfs_mount_options { uid_t fs_low_uid; gid_t fs_low_gid; userid_t fs_user_id; - gid_t gid; - mode_t mask; bool multiuser; unsigned int reserved_mb; }; @@ -360,9 +364,10 @@ static inline void release_top(struct sdcardfs_inode_info *info) iput(info->top); } -static inline int get_gid(struct sdcardfs_inode_info *info) { - struct sdcardfs_sb_info *sb_info = SDCARDFS_SB(info->vfs_inode.i_sb); - if (sb_info->options.gid == AID_SDCARD_RW) { +static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info) { + struct sdcardfs_vfsmount_options *opts = mnt->data; + + if (opts->gid == AID_SDCARD_RW) { /* As an optimization, certain trusted system components only run * as owner but operate across all users. Since we're now handing * out the sdcard_rw GID only to trusted apps, we're okay relaxing @@ -370,14 +375,15 @@ static inline int get_gid(struct sdcardfs_inode_info *info) { * assigned to app directories are still multiuser aware. 
 	 */
 		return AID_SDCARD_RW;
 	} else {
-		return multiuser_get_uid(info->userid, sb_info->options.gid);
+		return multiuser_get_uid(info->userid, opts->gid);
 	}
 }
 
-static inline int get_mode(struct sdcardfs_inode_info *info) {
+static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *info) {
 	int owner_mode;
 	int filtered_mode;
-	struct sdcardfs_sb_info * sb_info = SDCARDFS_SB(info->vfs_inode.i_sb);
-	int visible_mode = 0775 & ~sb_info->options.mask;
+	struct sdcardfs_vfsmount_options *opts = mnt->data;
+	int visible_mode = 0775 & ~opts->mask;
+
 	if (info->perm == PERM_PRE_ROOT) {
 		/* Top of multi-user view should always be visible to ensure
@@ -387,7 +393,7 @@ static inline int get_mode(struct sdcardfs_inode_info *info) {
 		/* Block "other" access to Android directories, since only apps
 		 * belonging to a specific user should be in there; we still
 		 * leave +x open for the default view. */
-		if (sb_info->options.gid == AID_SDCARD_RW) {
+		if (opts->gid == AID_SDCARD_RW) {
 			visible_mode = visible_mode & ~0006;
 		} else {
 			visible_mode = visible_mode & ~0007;
@@ -553,12 +559,16 @@ static inline int check_min_free_space(struct dentry *dentry, size_t size, int d
 	return 1;
 }
 
-/* Copies attrs and maintains sdcardfs managed attrs */
+/*
+ * Copies attrs and maintains sdcardfs managed attrs
+ * Since our permission check handles all special permissions, set those to be open
+ */
 static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct inode *src)
 {
-	dest->i_mode = (src->i_mode & S_IFMT) | get_mode(SDCARDFS_I(dest));
+	dest->i_mode = (src->i_mode & S_IFMT) | S_IRWXU | S_IRWXG |
+			S_IROTH | S_IXOTH; /* 0775 */
 	dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->d_uid);
-	dest->i_gid = make_kgid(&init_user_ns, get_gid(SDCARDFS_I(dest)));
+	dest->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW);
 	dest->i_rdev = src->i_rdev;
 	dest->i_atime = src->i_atime;
 	dest->i_mtime = src->i_mtime;

From 9eaefe628d1b144ed0893418639df1ef6cc1aac6 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Tue, 15 Nov 2016 13:35:18 -0800
Subject: [PATCH 0626/3220] ANDROID: sdcardfs: Change magic value

Sdcardfs uses the same magic value as wrapfs. This should not be the
case. As it is entirely in memory, the value can be changed without any
loss of compatibility.

Change-Id: I24200b805d5e6d32702638be99e47d50d7f2f746
Signed-off-by: Daniel Rosenberg
---
 include/uapi/linux/magic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index cfb5c406f344..e97d5b3ccfa8 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -52,7 +52,7 @@
 #define REISER2FS_SUPER_MAGIC_STRING	"ReIsEr2Fs"
 #define REISER2FS_JR_SUPER_MAGIC_STRING	"ReIsEr3Fs"
 
-#define SDCARDFS_SUPER_MAGIC	0xb550ca10
+#define SDCARDFS_SUPER_MAGIC	0x5dca2df5
 
 #define SMB_SUPER_MAGIC		0x517B
 #define CGROUP_SUPER_MAGIC	0x27e0eb

From 5cb5648935924fde6366062fc3a144ba06bfbd17 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Tue, 27 Dec 2016 12:36:29 -0800
Subject: [PATCH 0627/3220] ANDROID: sdcardfs: Fix locking issue with permission fix up

Don't use lookup_one_len so we can grab the spinlock that protects
d_subdirs.
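For reference, the fix replaces the locked lookup with a direct walk of
the parent's d_subdirs under d_lock, taking a reference on each child
before touching it. A condensed sketch of the resulting pattern (names
as in the hunk below; the recursive-descent branch is handled the same
way):

	spin_lock(&dentry->d_lock);
	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
		dget(child);
		if (!strncasecmp(child->d_name.name, name, len) &&
				d_inode(child)) {
			get_derived_permission(dentry, child);
			fixup_tmp_permissions(d_inode(child));
			dput(child);
			break;
		}
		dput(child);
	}
	spin_unlock(&dentry->d_lock);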
Bug: 30954918 Change-Id: I0c6a393252db7beb467e0d563739a3a14e1b5115 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 066edbbb6ad6..c77695c8f729 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -141,32 +141,26 @@ void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) info = SDCARDFS_I(d_inode(dentry)); if (needs_fixup(info->perm)) { - /* We need permission to fix up these values. - * Since permissions are based of of the mount, and - * we are accessing without the mount point, we create - * a fake mount with the permissions we will be using. - */ - struct vfsmount fakemnt; - struct sdcardfs_vfsmount_options opts; - fakemnt.data = &opts; - opts.gid = AID_SDCARD_RW; - opts.mask = 0; - mutex_lock(&d_inode(dentry)->i_mutex); - child = lookup_one_len2(name, &fakemnt, dentry, len); - mutex_unlock(&d_inode(dentry)->i_mutex); - if (!IS_ERR(child)) { - if (d_inode(child)) { - get_derived_permission(dentry, child); - fixup_tmp_permissions(d_inode(child)); - } - dput(child); + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + dget(child); + if (!strncasecmp(child->d_name.name, name, len)) { + if (d_inode(child)) { + get_derived_permission(dentry, child); + fixup_tmp_permissions(d_inode(child)); + dput(child); + break; + } + } + dput(child); } + spin_unlock(&dentry->d_lock); } else if (descendant_may_need_fixup(info->perm)) { - mutex_lock(&d_inode(dentry)->i_mutex); + spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { fixup_perms_recursive(child, name, len); } - mutex_unlock(&d_inode(dentry)->i_mutex); + spin_unlock(&dentry->d_lock); } dput(dentry); } From e33aa348eec662cc9dfb078445dda1644b294a86 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 5 Jan 2017 14:37:11 -0800 Subject: [PATCH 0628/3220] ANDROID: sdcardfs: Switch ->d_inode to d_inode() Change-Id: I12375cc2d6e82fb8adf0319be971f335f8d7a312 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 16 ++++++++-------- fs/sdcardfs/file.c | 2 +- fs/sdcardfs/lookup.c | 14 +++++++------- fs/sdcardfs/main.c | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index c77695c8f729..0c1a91f70b40 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -49,8 +49,8 @@ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry) { - struct sdcardfs_inode_info *info = SDCARDFS_I(dentry->d_inode); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); appid_t appid; /* By default, each inode inherits from its parent. @@ -61,7 +61,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st * stage of each system call by fix_derived_permission(inode). 
*/ - inherit_derived_state(parent->d_inode, dentry->d_inode); + inherit_derived_state(d_inode(parent), d_inode(dentry)); /* Derive custom permissions based on parent and current node */ switch (parent_info->perm) { @@ -134,7 +134,7 @@ void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) struct sdcardfs_inode_info *info; if (!dget(dentry)) return; - if (!dentry->d_inode) { + if (!d_inode(dentry)) { dput(dentry); return; } @@ -189,7 +189,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) { struct dentry *parent; - if(!dentry || !dentry->d_inode) { + if(!dentry || !d_inode(dentry)) { printk(KERN_ERR "sdcardfs: %s: invalid dentry\n", __func__); return; } @@ -198,7 +198,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) * 2. remove the root dentry update */ if(IS_ROOT(dentry)) { - //setup_default_pre_root_state(dentry->d_inode); + //setup_default_pre_root_state(d_inode(dentry)); } else { parent = dget_parent(dentry); if(parent) { @@ -213,7 +213,7 @@ int need_graft_path(struct dentry *dentry) { int ret = 0; struct dentry *parent = dget_parent(dentry); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); if(parent_info->perm == PERM_ANDROID && @@ -272,7 +272,7 @@ int is_base_obbpath(struct dentry *dentry) { int ret = 0; struct dentry *parent = dget_parent(dentry); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(parent->d_inode); + struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); spin_lock(&SDCARDFS_D(dentry)->lock); diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index c249fa982d3c..7750a0472389 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -216,7 +216,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) goto out_err; } - if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 00a711ec2733..e94a65c8bbbd 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -179,7 +179,7 @@ int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, struct inode *lower_inode; struct super_block *lower_sb; - lower_inode = lower_path->dentry->d_inode; + lower_inode = d_inode(lower_path->dentry); lower_sb = sdcardfs_lower_super(sb); /* check that the lower file system didn't cross a mount point */ @@ -359,7 +359,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, parent = dget_parent(dentry); - if(!check_caller_access_to_name(parent->d_inode, dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { ret = ERR_PTR(-EACCES); printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", @@ -386,16 +386,16 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, } if (ret) dentry = ret; - if (dentry->d_inode) { - fsstack_copy_attr_times(dentry->d_inode, - sdcardfs_lower_inode(dentry->d_inode)); + if (d_inode(dentry)) { + fsstack_copy_attr_times(d_inode(dentry), + sdcardfs_lower_inode(d_inode(dentry))); /* get derived permission */ 
 		get_derived_permission(parent, dentry);
 		fixup_tmp_permissions(d_inode(dentry));
 	}
 	/* update parent directory's atime */
-	fsstack_copy_attr_atime(parent->d_inode,
-				sdcardfs_lower_inode(parent->d_inode));
+	fsstack_copy_attr_atime(d_inode(parent),
+				sdcardfs_lower_inode(d_inode(parent)));
 
 out:
 	sdcardfs_put_lower_path(parent, &lower_parent_path);
diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
index eec10ccacd99..7a8eae29e44d 100644
--- a/fs/sdcardfs/main.c
+++ b/fs/sdcardfs/main.c
@@ -297,7 +297,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb,
 	sb->s_op = &sdcardfs_sops;
 
 	/* get a new inode and allocate our root dentry */
-	inode = sdcardfs_iget(sb, lower_path.dentry->d_inode, 0);
+	inode = sdcardfs_iget(sb, d_inode(lower_path.dentry), 0);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_sput;

From b5858221c1c4f4bdc9ef67eb75ecf22580368820 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 5 Jan 2017 14:37:11 -0800
Subject: [PATCH 0629/3220] ANDROID: mnt: remount should propagate to slaves of slaves

propagate_remount was not accounting for the slave mounts of other
slave mounts, leading to some namespaces not receiving the remount
information.

Bug: 33731928
Change-Id: Idc9e8c2ed126a4143229fc23f10a959c2d0a3854
Signed-off-by: Daniel Rosenberg
---
 fs/pnode.c | 27 +++++++++++++++++++++------
 fs/pnode.h |  2 +-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 4e2d78ec053a..7bb1879e8442 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -451,16 +451,31 @@ int propagate_umount(struct list_head *list)
 	return 0;
 }
 
-int propagate_remount(struct mount *mnt) {
-	struct mount *m;
+/*
+ * Iterates over all slaves, and slaves of slaves.
+ */
+static struct mount *next_descendent(struct mount *root, struct mount *cur)
+{
+	if (!IS_MNT_NEW(cur) && !list_empty(&cur->mnt_slave_list))
+		return first_slave(cur);
+	do {
+		if (cur->mnt_slave.next != &cur->mnt_master->mnt_slave_list)
+			return next_slave(cur);
+		cur = cur->mnt_master;
+	} while (cur != root);
+	return NULL;
+}
+
+void propagate_remount(struct mount *mnt)
+{
+	struct mount *m = mnt;
 	struct super_block *sb = mnt->mnt.mnt_sb;
-	int ret = 0;
 
 	if (sb->s_op->copy_mnt_data) {
-		for (m = first_slave(mnt); m->mnt_slave.next != &mnt->mnt_slave_list; m = next_slave(m)) {
+		m = next_descendent(mnt, m);
+		while (m) {
 			sb->s_op->copy_mnt_data(m->mnt.data, mnt->mnt.data);
+			m = next_descendent(mnt, m);
 		}
 	}
-
-	return ret;
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 4e8e94dc9e6a..3cb58c0cdcbc 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -44,7 +44,7 @@ int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 int propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
-int propagate_remount(struct mount *);
+void propagate_remount(struct mount *);
 void mnt_release_group_id(struct mount *);
 int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);

From d240a0a145287caab3601ac48c55d696488ebf1c Mon Sep 17 00:00:00 2001
From: Cameron Gutman
Date: Thu, 23 Jun 2016 10:24:42 -0700
Subject: [PATCH 0630/3220] BACKPORT: Input: xpad - fix oops when attaching an unknown Xbox One gamepad

Xbox One controllers have multiple interfaces which all have the same
class, subclass, and protocol. One of these interfaces has only a
single endpoint.
When Xpad attempts to bind to this interface, it causes an oops when
trying to initialize the output URB by accessing the second endpoint's
descriptor.

This situation was avoided for known Xbox One devices by checking the
XTYPE constant associated with the VID and PID tuple. However, this
breaks when new or previously unknown Xbox One controllers are attached
to the system.

This change addresses the problem by deriving the XTYPE for Xbox One
controllers based on the interface protocol before checking the
interface number.

Change-Id: If15a19cde514ffdeddb506da9c4d34479408005a
Fixes: 1a48ff81b391 ("Input: xpad - add support for Xbox One controllers")
Signed-off-by: Cameron Gutman
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Torokhov
---
 drivers/input/joystick/xpad.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c
index fd4100d56d8c..35e444b4b8b0 100644
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c
@@ -1206,16 +1206,6 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 			break;
 	}
 
-	if (xpad_device[i].xtype == XTYPE_XBOXONE &&
-	    intf->cur_altsetting->desc.bInterfaceNumber != 0) {
-		/*
-		 * The Xbox One controller lists three interfaces all with the
-		 * same interface class, subclass and protocol. Differentiate by
-		 * interface number.
-		 */
-		return -ENODEV;
-	}
-
 	xpad = kzalloc(sizeof(struct usb_xpad), GFP_KERNEL);
 	if (!xpad)
 		return -ENOMEM;
@@ -1246,6 +1236,8 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 		if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) {
 			if (intf->cur_altsetting->desc.bInterfaceProtocol == 129)
 				xpad->xtype = XTYPE_XBOX360W;
+			else if (intf->cur_altsetting->desc.bInterfaceProtocol == 208)
+				xpad->xtype = XTYPE_XBOXONE;
 			else
 				xpad->xtype = XTYPE_XBOX360;
 		} else {
@@ -1260,6 +1252,17 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 			xpad->mapping |= MAP_STICKS_TO_NULL;
 	}
 
+	if (xpad->xtype == XTYPE_XBOXONE &&
+	    intf->cur_altsetting->desc.bInterfaceNumber != 0) {
+		/*
+		 * The Xbox One controller lists three interfaces all with the
+		 * same interface class, subclass and protocol. Differentiate by
+		 * interface number.
+		 */
+		error = -ENODEV;
+		goto err_free_in_urb;
+	}
+
 	error = xpad_init_output(intf, xpad);
 	if (error)
 		goto err_free_in_urb;

From a07b34771ace2077f218aa6eb30f4107495a6c2e Mon Sep 17 00:00:00 2001
From: Cameron Gutman
Date: Wed, 29 Jun 2016 09:51:35 -0700
Subject: [PATCH 0631/3220] BACKPORT: Input: xpad - validate USB endpoint count during probe

This prevents a malicious USB device from causing an oops.
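For context, xpad's probe path assumes the interface exposes both an in
and an out endpoint; a crafted device that reports a single endpoint
makes the second index read past the endpoint array. An illustrative
sketch of the access the new check below guards against (struct and
field names are from the USB core, shown here only to make the oops
concrete):

	struct usb_host_interface *alt = intf->cur_altsetting;

	/* only safe once alt->desc.bNumEndpoints == 2 has been checked */
	struct usb_endpoint_descriptor *ep_in  = &alt->endpoint[0].desc;
	struct usb_endpoint_descriptor *ep_out = &alt->endpoint[1].desc;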
Change-Id: I47c27541a4c2f0cec354eb83b3013bb825ed6e90 Signed-off-by: Cameron Gutman Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 35e444b4b8b0..2b2f9d66c2c7 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1200,6 +1200,9 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id int ep_irq_in_idx; int i, error; + if (intf->cur_altsetting->desc.bNumEndpoints != 2) + return -ENODEV; + for (i = 0; xpad_device[i].idVendor; i++) { if ((le16_to_cpu(udev->descriptor.idVendor) == xpad_device[i].idVendor) && (le16_to_cpu(udev->descriptor.idProduct) == xpad_device[i].idProduct)) From 1cd3d347147bee1b8a3fb7624ab23eb3bdcece41 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 30 Jan 2017 12:26:08 -0800 Subject: [PATCH 0632/3220] ANDROID: fs: Export free_fs_struct and set_fs_pwd allmodconfig builds fail with: ERROR: "free_fs_struct" undefined! ERROR: "set_fs_pwd" undefined! Export the missing symbols. Change-Id: I4877ead19d7e7f0c93d4c4cad5681364284323aa Fixes: 0ec03f845799 ("ANDROID: sdcardfs: override umask on mkdir and create") Signed-off-by: Guenter Roeck --- fs/fs_struct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 005dcb401369..940c683561dd 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -44,6 +44,7 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path) if (old_pwd.dentry) path_put(&old_pwd); } +EXPORT_SYMBOL(set_fs_pwd); static inline int replace_path(struct path *p, const struct path *old, const struct path *new) { @@ -89,6 +90,7 @@ void free_fs_struct(struct fs_struct *fs) path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } +EXPORT_SYMBOL(free_fs_struct); void exit_fs(struct task_struct *tsk) { From 4c7fc336f6e3ec34af2c6931fce4d496f560da90 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 30 Jan 2017 12:29:00 -0800 Subject: [PATCH 0633/3220] ANDROID: fs: Export vfs_rmdir2 allmodconfig builds fail with ERROR: "vfs_rmdir2" undefined! Export the missing function. 
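The pattern matches the fs_struct exports above: any VFS helper that a
modular sdcardfs calls needs an EXPORT_SYMBOL next to its definition so
modpost can resolve the reference. A minimal sketch (body elided; the
signature follows the vfs_rmdir wrapper below):

	int vfs_rmdir2(struct vfsmount *mnt, struct inode *dir,
		       struct dentry *dentry)
	{
		/* ... */
	}
	EXPORT_SYMBOL(vfs_rmdir2);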
Change-Id: I983d327e59fd34e0484f3c54d925e97d3905c19c Fixes: f9cb61dcb00c ("ANDROID: sdcardfs: User new permission2 functions") Signed-off-by: Guenter Roeck --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 0230bc59f344..a896620f6d79 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3742,6 +3742,8 @@ out: d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir2); + int vfs_rmdir(struct inode *dir, struct dentry *dentry) { return vfs_rmdir2(NULL, dir, dentry); From df3087d4836dd35c138c628c50b0986169183ee9 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Wed, 1 Feb 2017 12:53:45 +0530 Subject: [PATCH 0634/3220] ANDROID: binder: fix format specifier for type binder_size_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix following warning on 32bit ARCH build: CC drivers/android/binder.o drivers/android/binder.c: In function ‘binder_transaction’: ./include/linux/kern_levels.h:4:18: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 4 has type ‘binder_size_t {aka unsigned int}’ [-Wformat=] drivers/android/binder.c:2047:3: note: in expansion of macro ‘binder_user_error’ binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", ^ Change-Id: I943d0d4d54f7f2a019900cc18e55bed661bec5a5 Fixes: Change-Id: I02417f28cff14688f2e1d6fcb959438fd96566cc (android: binder: support for scatter-gather.") Signed-off-by: Amit Pundir --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 951393825261..2196244c9647 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2047,7 +2047,7 @@ static void binder_transaction(struct binder_proc *proc, if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", proc->pid, thread->pid, - extra_buffers_size); + (u64)extra_buffers_size); return_error = BR_FAILED_REPLY; goto err_bad_offset; } From 3e4f5484dcd596661af85bb28e96d946255e99f4 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 20 Jan 2017 15:19:13 -0800 Subject: [PATCH 0635/3220] ANDROID: sdcardfs: Allow non-owners to touch This modifies the permission checks in setattr to allow for non-owners to modify the timestamp of files to things other than the current time. This still requires write access, as enforced by the permission call, but relaxes the requirement that the caller must be the owner, allowing those with group permissions to change it as well. Bug: 11118565 Change-Id: Ied31f0cce2797675c7ef179eeb4e088185adcbad Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 76a6e8ad0736..6f450e523675 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -739,6 +739,11 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct * this user can change the lower inode: that should happen when * calling notify_change on the lower inode. */ + /* prepare our own lower struct iattr (with the lower file) */ + memcpy(&lower_ia, ia, sizeof(lower_ia)); + /* Allow touch updating timestamps. A previous permission check ensures + * we have write access. 
Changes to mode, owner, and group are ignored*/ + ia->ia_valid |= ATTR_FORCE; err = inode_change_ok(&tmp, ia); if (!err) { @@ -764,8 +769,6 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct lower_mnt = lower_path.mnt; lower_inode = sdcardfs_lower_inode(inode); - /* prepare our own lower struct iattr (with the lower file) */ - memcpy(&lower_ia, ia, sizeof(lower_ia)); if (ia->ia_valid & ATTR_FILE) lower_ia.ia_file = sdcardfs_lower_file(ia->ia_file); From d9300a998d196f15ad750f2ce568c37a49e3d82e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Sat, 21 Jan 2017 00:35:26 -0800 Subject: [PATCH 0636/3220] ANDROID: sdcardfs: Refactor configfs interface This refactors the configfs code to be more easily extended. It will allow additional files to be added easily. Bug: 34542611 Bug: 34262585 Change-Id: I73c9b0ae5ca7eb27f4ebef3e6807f088b512d539 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/packagelist.c | 133 +++++++++++++++----------------------- 1 file changed, 53 insertions(+), 80 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 03776fa5f26c..0b3fb50b1fe4 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -220,26 +220,24 @@ static void packagelist_destroy(void) printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); } -struct package_appid { +struct package_details { struct config_item item; - int add_pid; + const char *name; }; -static inline struct package_appid *to_package_appid(struct config_item *item) +static inline struct package_details *to_package_details(struct config_item *item) { - return item ? container_of(item, struct package_appid, item) : NULL; + return item ? container_of(item, struct package_details, item) : NULL; } -static ssize_t package_appid_attr_show(struct config_item *item, - char *page) +static ssize_t package_details_appid_show(struct config_item *item, char *page) { - return scnprintf(page, PAGE_SIZE, "%u\n", get_appid(item->ci_name)); + return scnprintf(page, PAGE_SIZE, "%u\n", get_appid(to_package_details(item)->name)); } -static ssize_t package_appid_attr_store(struct config_item *item, +static ssize_t package_details_appid_store(struct config_item *item, const char *page, size_t count) { - struct package_appid *package_appid = to_package_appid(item); unsigned int tmp; int ret; @@ -247,73 +245,60 @@ static ssize_t package_appid_attr_store(struct config_item *item, if (ret) return ret; - ret = insert_packagelist_entry(item->ci_name, tmp); - package_appid->add_pid = tmp; + ret = insert_packagelist_entry(to_package_details(item)->name, tmp); + if (ret) return ret; return count; } -static struct configfs_attribute package_appid_attr_add_pid = { - .ca_owner = THIS_MODULE, - .ca_name = "appid", - .ca_mode = S_IRUGO | S_IWUGO, - .show = package_appid_attr_show, - .store = package_appid_attr_store, -}; +static void package_details_release(struct config_item *item) +{ + struct package_details *package_details = to_package_details(item); + printk(KERN_INFO "sdcardfs: removing %s\n", package_details->name); + remove_packagelist_entry(package_details->name); + kfree(package_details->name); + kfree(package_details); +} -static struct configfs_attribute *package_appid_attrs[] = { - &package_appid_attr_add_pid, +CONFIGFS_ATTR(package_details_, appid); + +static struct configfs_attribute *package_details_attrs[] = { + &package_details_attr_appid, NULL, }; -static void package_appid_release(struct config_item *item) -{ - printk(KERN_INFO "sdcardfs: removing %s\n", 
item->ci_dentry->d_name.name); - /* item->ci_name is freed already, so we rely on the dentry */ - remove_packagelist_entry(item->ci_dentry->d_name.name); - kfree(to_package_appid(item)); -} - -static struct configfs_item_operations package_appid_item_ops = { - .release = package_appid_release, +static struct configfs_item_operations package_details_item_ops = { + .release = package_details_release, }; static struct config_item_type package_appid_type = { - .ct_item_ops = &package_appid_item_ops, - .ct_attrs = package_appid_attrs, + .ct_item_ops = &package_details_item_ops, + .ct_attrs = package_details_attrs, .ct_owner = THIS_MODULE, }; - -struct sdcardfs_packages { - struct config_group group; -}; - -static inline struct sdcardfs_packages *to_sdcardfs_packages(struct config_item *item) +static struct config_item *packages_make_item(struct config_group *group, const char *name) { - return item ? container_of(to_config_group(item), struct sdcardfs_packages, group) : NULL; -} + struct package_details *package_details; -static struct config_item *sdcardfs_packages_make_item(struct config_group *group, const char *name) -{ - struct package_appid *package_appid; - - package_appid = kzalloc(sizeof(struct package_appid), GFP_KERNEL); - if (!package_appid) + package_details = kzalloc(sizeof(struct package_details), GFP_KERNEL); + if (!package_details) return ERR_PTR(-ENOMEM); + package_details->name = kstrdup(name, GFP_KERNEL); + if (!package_details->name) { + kfree(package_details); + return ERR_PTR(-ENOMEM); + } - config_item_init_type_name(&package_appid->item, name, + config_item_init_type_name(&package_details->item, name, &package_appid_type); - package_appid->add_pid = 0; - - return &package_appid->item; + return &package_details->item; } -static ssize_t packages_attr_show(struct config_item *item, - char *page) +static ssize_t packages_list_show(struct config_item *item, char *page) { struct hashtable_entry *hash_cur; int i; @@ -335,49 +320,37 @@ static ssize_t packages_attr_show(struct config_item *item, return count; } -static struct configfs_attribute sdcardfs_packages_attr_description = { - .ca_owner = THIS_MODULE, - .ca_name = "packages_gid.list", - .ca_mode = S_IRUGO, - .show = packages_attr_show, +static struct configfs_attribute packages_attr_packages_gid_list = { + .ca_name = "packages_gid.list", + .ca_mode = S_IRUGO, + .ca_owner = THIS_MODULE, + .show = packages_list_show, }; -static struct configfs_attribute *sdcardfs_packages_attrs[] = { - &sdcardfs_packages_attr_description, +static struct configfs_attribute *packages_attrs[] = { + &packages_attr_packages_gid_list, NULL, }; -static void sdcardfs_packages_release(struct config_item *item) -{ - - printk(KERN_INFO "sdcardfs: destroyed something?\n"); - kfree(to_sdcardfs_packages(item)); -} - -static struct configfs_item_operations sdcardfs_packages_item_ops = { - .release = sdcardfs_packages_release, -}; - /* * Note that, since no extra work is required on ->drop_item(), * no ->drop_item() is provided. 
*/ -static struct configfs_group_operations sdcardfs_packages_group_ops = { - .make_item = sdcardfs_packages_make_item, +static struct configfs_group_operations packages_group_ops = { + .make_item = packages_make_item, }; -static struct config_item_type sdcardfs_packages_type = { - .ct_item_ops = &sdcardfs_packages_item_ops, - .ct_group_ops = &sdcardfs_packages_group_ops, - .ct_attrs = sdcardfs_packages_attrs, +static struct config_item_type packages_type = { + .ct_group_ops = &packages_group_ops, + .ct_attrs = packages_attrs, .ct_owner = THIS_MODULE, }; -static struct configfs_subsystem sdcardfs_packages_subsys = { +static struct configfs_subsystem sdcardfs_packages = { .su_group = { .cg_item = { .ci_namebuf = "sdcardfs", - .ci_type = &sdcardfs_packages_type, + .ci_type = &packages_type, }, }, }; @@ -385,7 +358,7 @@ static struct configfs_subsystem sdcardfs_packages_subsys = { static int configfs_sdcardfs_init(void) { int ret; - struct configfs_subsystem *subsys = &sdcardfs_packages_subsys; + struct configfs_subsystem *subsys = &sdcardfs_packages; config_group_init(&subsys->su_group); mutex_init(&subsys->su_mutex); @@ -400,7 +373,7 @@ static int configfs_sdcardfs_init(void) static void configfs_sdcardfs_exit(void) { - configfs_unregister_subsystem(&sdcardfs_packages_subsys); + configfs_unregister_subsystem(&sdcardfs_packages); } int packagelist_init(void) From 09c77d6230066003be095f1ac315622ec17ee355 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Sun, 22 Jan 2017 15:32:49 -0800 Subject: [PATCH 0637/3220] ANDROID: sdcardfs: add support for user permission isolation This allows you to hide the existence of a package from a user by adding them to an exclude list. If a user creates that package's folder and is on the exclude list, they will not see that package's id. 
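Concretely, permission derivation now consults the per-user exclude
list before handing out a package's uid; condensed from the
derived_perm.c hunk below:

	appid = get_appid(newdentry->d_name.name);
	if (appid != 0 &&
			!is_excluded(newdentry->d_name.name, parent_info->userid))
		info->d_uid = multiuser_get_uid(parent_info->userid, appid);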
Bug: 34542611 Change-Id: I9eb82e0bf2457d7eb81ee56153b9c7d2f6646323 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 32 ++-- fs/sdcardfs/packagelist.c | 297 ++++++++++++++++++++++++++++++++++--- fs/sdcardfs/sdcardfs.h | 17 ++- 3 files changed, 306 insertions(+), 40 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 0c1a91f70b40..8e3baee4a2d9 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -103,7 +103,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st case PERM_ANDROID_OBB: case PERM_ANDROID_MEDIA: appid = get_appid(newdentry->d_name.name); - if (appid != 0) { + if (appid != 0 && !is_excluded(newdentry->d_name.name, parent_info->userid)) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } set_top(info, &info->vfs_inode); @@ -116,8 +116,10 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) get_derived_permission_new(parent, dentry, dentry); } -static int descendant_may_need_fixup(perm_t perm) { - if (perm == PERM_PRE_ROOT || perm == PERM_ROOT || perm == PERM_ANDROID) +static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) { + if (info->perm == PERM_ROOT) + return (limit->flags & BY_USERID)?info->userid == limit->userid:1; + if (info->perm == PERM_PRE_ROOT || info->perm == PERM_ANDROID) return 1; return 0; } @@ -129,7 +131,7 @@ static int needs_fixup(perm_t perm) { return 0; } -void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) { +void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) { struct dentry *child; struct sdcardfs_inode_info *info; if (!dget(dentry)) @@ -143,22 +145,22 @@ void fixup_perms_recursive(struct dentry *dentry, const char* name, size_t len) if (needs_fixup(info->perm)) { spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - dget(child); - if (!strncasecmp(child->d_name.name, name, len)) { - if (d_inode(child)) { - get_derived_permission(dentry, child); - fixup_tmp_permissions(d_inode(child)); - dput(child); - break; - } + dget(child); + if (!(limit->flags & BY_NAME) || !strncasecmp(child->d_name.name, limit->name, limit->length)) { + if (d_inode(child)) { + get_derived_permission(dentry, child); + fixup_tmp_permissions(d_inode(child)); + dput(child); + break; } - dput(child); + } + dput(child); } spin_unlock(&dentry->d_lock); - } else if (descendant_may_need_fixup(info->perm)) { + } else if (descendant_may_need_fixup(info, limit)) { spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - fixup_perms_recursive(child, name, len); + fixup_perms_recursive(child, limit); } spin_unlock(&dentry->d_lock); } diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 0b3fb50b1fe4..6eb73ddc2ceb 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -31,11 +31,13 @@ struct hashtable_entry { struct hlist_node hlist; + struct hlist_node dlist; /* for deletion cleanup */ const char *key; atomic_t value; }; static DEFINE_HASHTABLE(package_to_appid, 8); +static DEFINE_HASHTABLE(package_to_userid, 8); static struct kmem_cache *hashtable_entry_cachep; @@ -69,6 +71,22 @@ appid_t get_appid(const char *app_name) return 0; } +appid_t is_excluded(const char *app_name, userid_t user) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(app_name); + + rcu_read_lock(); + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { 
+ if (atomic_read(&hash_cur->value) == user && !strcasecmp(app_name, hash_cur->key)) { + rcu_read_unlock(); + return 1; + } + } + rcu_read_unlock(); + return 0; +} + /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access * even further, such as enforcing that apps hold sdcard_rw. */ @@ -124,7 +142,7 @@ static struct hashtable_entry *alloc_packagelist_entry(const char *key, return ret; } -static int insert_packagelist_entry_locked(const char *key, appid_t value) +static int insert_packagelist_appid_entry_locked(const char *key, appid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; @@ -143,18 +161,64 @@ static int insert_packagelist_entry_locked(const char *key, appid_t value) return 0; } -static void fixup_perms(struct super_block *sb, const char *key) { - if (sb && sb->s_magic == SDCARDFS_SUPER_MAGIC) { - fixup_perms_recursive(sb->s_root, key, strlen(key)); +static int insert_userid_exclude_entry_locked(const char *key, userid_t value) +{ + struct hashtable_entry *hash_cur; + struct hashtable_entry *new_entry; + unsigned int hash = str_hash(key); + + /* Only insert if not already present */ + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (atomic_read(&hash_cur->value) == value && !strcasecmp(key, hash_cur->key)) + return 0; + } + new_entry = alloc_packagelist_entry(key, value); + if (!new_entry) + return -ENOMEM; + hash_add_rcu(package_to_userid, &new_entry->hlist, hash); + return 0; +} + +static void fixup_all_perms_name(const char *key) +{ + struct sdcardfs_sb_info *sbinfo; + struct limit_search limit = { + .flags = BY_NAME, + .name = key, + .length = strlen(key), + }; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { + if (sbinfo_has_sdcard_magic(sbinfo)) + fixup_perms_recursive(sbinfo->sb->s_root, &limit); } } -static void fixup_all_perms(const char *key) +static void fixup_all_perms_name_userid(const char *key, userid_t userid) { struct sdcardfs_sb_info *sbinfo; - list_for_each_entry(sbinfo, &sdcardfs_super_list, list) - if (sbinfo) - fixup_perms(sbinfo->sb, key); + struct limit_search limit = { + .flags = BY_NAME | BY_USERID, + .name = key, + .length = strlen(key), + .userid = userid, + }; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { + if (sbinfo_has_sdcard_magic(sbinfo)) + fixup_perms_recursive(sbinfo->sb->s_root, &limit); + } +} + +static void fixup_all_perms_userid(userid_t userid) +{ + struct sdcardfs_sb_info *sbinfo; + struct limit_search limit = { + .flags = BY_USERID, + .userid = userid, + }; + list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { + if (sbinfo_has_sdcard_magic(sbinfo)) + fixup_perms_recursive(sbinfo->sb->s_root, &limit); + } } static int insert_packagelist_entry(const char *key, appid_t value) @@ -162,9 +226,22 @@ static int insert_packagelist_entry(const char *key, appid_t value) int err; mutex_lock(&sdcardfs_super_list_lock); - err = insert_packagelist_entry_locked(key, value); + err = insert_packagelist_appid_entry_locked(key, value); if (!err) - fixup_all_perms(key); + fixup_all_perms_name(key); + mutex_unlock(&sdcardfs_super_list_lock); + + return err; +} + +static int insert_userid_exclude_entry(const char *key, userid_t value) +{ + int err; + + mutex_lock(&sdcardfs_super_list_lock); + err = insert_userid_exclude_entry_locked(key, value); + if (!err) + fixup_all_perms_name_userid(key, value); mutex_unlock(&sdcardfs_super_list_lock); return err; @@ -173,7 +250,7 @@ static int 
insert_packagelist_entry(const char *key, appid_t value) static void free_packagelist_entry(struct hashtable_entry *entry) { kfree(entry->key); - hash_del_rcu(&entry->hlist); + hash_del_rcu(&entry->dlist); kmem_cache_free(hashtable_entry_cachep, entry); } @@ -181,22 +258,84 @@ static void remove_packagelist_entry_locked(const char *key) { struct hashtable_entry *hash_cur; unsigned int hash = str_hash(key); + struct hlist_node *h_t; + HLIST_HEAD(free_list); + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key)) { + hash_del_rcu(&hash_cur->hlist); + hlist_add_head(&hash_cur->dlist, &free_list); + } + } hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { if (!strcasecmp(key, hash_cur->key)) { hash_del_rcu(&hash_cur->hlist); - synchronize_rcu(); - free_packagelist_entry(hash_cur); - return; + hlist_add_head(&hash_cur->dlist, &free_list); + break; } } + synchronize_rcu(); + hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) + free_packagelist_entry(hash_cur); } static void remove_packagelist_entry(const char *key) { mutex_lock(&sdcardfs_super_list_lock); remove_packagelist_entry_locked(key); - fixup_all_perms(key); + fixup_all_perms_name(key); + mutex_unlock(&sdcardfs_super_list_lock); + return; +} + +static void remove_userid_all_entry_locked(userid_t userid) +{ + struct hashtable_entry *hash_cur; + struct hlist_node *h_t; + HLIST_HEAD(free_list); + int i; + + hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) { + if (atomic_read(&hash_cur->value) == userid) { + hash_del_rcu(&hash_cur->hlist); + hlist_add_head(&hash_cur->dlist, &free_list); + } + } + synchronize_rcu(); + hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) { + free_packagelist_entry(hash_cur); + } +} + +static void remove_userid_all_entry(userid_t userid) +{ + mutex_lock(&sdcardfs_super_list_lock); + remove_userid_all_entry_locked(userid); + fixup_all_perms_userid(userid); + mutex_unlock(&sdcardfs_super_list_lock); + return; +} + +static void remove_userid_exclude_entry_locked(const char *key, userid_t userid) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(key); + + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == userid) { + hash_del_rcu(&hash_cur->hlist); + synchronize_rcu(); + free_packagelist_entry(hash_cur); + break; + } + } +} + +static void remove_userid_exclude_entry(const char *key, userid_t userid) +{ + mutex_lock(&sdcardfs_super_list_lock); + remove_userid_exclude_entry_locked(key, userid); + fixup_all_perms_name_userid(key, userid); mutex_unlock(&sdcardfs_super_list_lock); return; } @@ -210,16 +349,44 @@ static void packagelist_destroy(void) mutex_lock(&sdcardfs_super_list_lock); hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { hash_del_rcu(&hash_cur->hlist); - hlist_add_head(&hash_cur->hlist, &free_list); - + hlist_add_head(&hash_cur->dlist, &free_list); + } + hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) { + hash_del_rcu(&hash_cur->hlist); + hlist_add_head(&hash_cur->dlist, &free_list); } synchronize_rcu(); - hlist_for_each_entry_safe(hash_cur, h_t, &free_list, hlist) + hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) free_packagelist_entry(hash_cur); mutex_unlock(&sdcardfs_super_list_lock); printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); } +#define SDCARDFS_CONFIGFS_ATTR(_pfx, _name) \ +static struct configfs_attribute _pfx##attr_##_name = { \ + .ca_name = 
__stringify(_name), \ + .ca_mode = S_IRUGO | S_IWUGO, \ + .ca_owner = THIS_MODULE, \ + .show = _pfx##_name##_show, \ + .store = _pfx##_name##_store, \ +} + +#define SDCARDFS_CONFIGFS_ATTR_RO(_pfx, _name) \ +static struct configfs_attribute _pfx##attr_##_name = { \ + .ca_name = __stringify(_name), \ + .ca_mode = S_IRUGO, \ + .ca_owner = THIS_MODULE, \ + .show = _pfx##_name##_show, \ +} + +#define SDCARDFS_CONFIGFS_ATTR_WO(_pfx, _name) \ +static struct configfs_attribute _pfx##attr_##_name = { \ + .ca_name = __stringify(_name), \ + .ca_mode = S_IWUGO, \ + .ca_owner = THIS_MODULE, \ + .store = _pfx##_name##_store, \ +} + struct package_details { struct config_item item; const char *name; @@ -253,6 +420,58 @@ static ssize_t package_details_appid_store(struct config_item *item, return count; } +static ssize_t package_details_excluded_userids_show(struct config_item *item, + char *page) +{ + struct package_details *package_details = to_package_details(item); + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(package_details->name); + int count = 0; + + rcu_read_lock(); + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (!strcasecmp(package_details->name, hash_cur->key)) + count += scnprintf(page + count, PAGE_SIZE - count, + "%d ", atomic_read(&hash_cur->value)); + } + rcu_read_unlock(); + if (count) + count--; + count += scnprintf(page + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t package_details_excluded_userids_store(struct config_item *item, + const char *page, size_t count) +{ + unsigned int tmp; + int ret; + + ret = kstrtouint(page, 10, &tmp); + if (ret) + return ret; + + ret = insert_userid_exclude_entry(to_package_details(item)->name, tmp); + + if (ret) + return ret; + + return count; +} + +static ssize_t package_details_clear_userid_store(struct config_item *item, + const char *page, size_t count) +{ + unsigned int tmp; + int ret; + + ret = kstrtouint(page, 10, &tmp); + if (ret) + return ret; + remove_userid_exclude_entry(to_package_details(item)->name, tmp); + return count; +} + static void package_details_release(struct config_item *item) { struct package_details *package_details = to_package_details(item); @@ -262,10 +481,14 @@ static void package_details_release(struct config_item *item) kfree(package_details); } -CONFIGFS_ATTR(package_details_, appid); +SDCARDFS_CONFIGFS_ATTR(package_details_, appid); +SDCARDFS_CONFIGFS_ATTR(package_details_, excluded_userids); +SDCARDFS_CONFIGFS_ATTR_WO(package_details_, clear_userid); static struct configfs_attribute *package_details_attrs[] = { &package_details_attr_appid, + &package_details_attr_excluded_userids, + &package_details_attr_clear_userid, NULL, }; @@ -293,23 +516,33 @@ static struct config_item *packages_make_item(struct config_group *group, const } config_item_init_type_name(&package_details->item, name, - &package_appid_type); + &package_appid_type); return &package_details->item; } static ssize_t packages_list_show(struct config_item *item, char *page) { - struct hashtable_entry *hash_cur; + struct hashtable_entry *hash_cur_app; + struct hashtable_entry *hash_cur_user; int i; int count = 0, written = 0; const char errormsg[] = "\n"; + unsigned int hash; rcu_read_lock(); - hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { + hash_for_each_rcu(package_to_appid, i, hash_cur_app, hlist) { written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", - (const char *)hash_cur->key, atomic_read(&hash_cur->value)); - if (count + written == 
PAGE_SIZE - sizeof(errormsg)) { + hash_cur_app->key, atomic_read(&hash_cur_app->value)); + hash = str_hash(hash_cur_app->key); + hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) { + if (!strcasecmp(hash_cur_app->key, hash_cur_user->key)) { + written += scnprintf(page + count + written - 1, + PAGE_SIZE - sizeof(errormsg) - count - written + 1, + " %d\n", atomic_read(&hash_cur_user->value)) - 1; + } + } + if (count + written == PAGE_SIZE - sizeof(errormsg) - 1) { count += scnprintf(page + count, PAGE_SIZE - count, errormsg); break; } @@ -320,6 +553,19 @@ static ssize_t packages_list_show(struct config_item *item, char *page) return count; } +static ssize_t packages_remove_userid_store(struct config_item *item, + const char *page, size_t count) +{ + unsigned int tmp; + int ret; + + ret = kstrtouint(page, 10, &tmp); + if (ret) + return ret; + remove_userid_all_entry(tmp); + return count; +} + static struct configfs_attribute packages_attr_packages_gid_list = { .ca_name = "packages_gid.list", .ca_mode = S_IRUGO, @@ -327,8 +573,11 @@ static struct configfs_attribute packages_attr_packages_gid_list = { .show = packages_list_show, }; +SDCARDFS_CONFIGFS_ATTR_WO(packages_, remove_userid); + static struct configfs_attribute *packages_attrs[] = { &packages_attr_packages_gid_list, + &packages_attr_remove_userid, NULL, }; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index b03130329014..3434849cee63 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -335,6 +335,11 @@ static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \ SDCARDFS_DENT_FUNC(lower_path) SDCARDFS_DENT_FUNC(orig_path) +static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo) +{ + return sbinfo && sbinfo->sb && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; +} + /* grab a refererence if we aren't linking to ourself */ static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) { @@ -442,18 +447,28 @@ extern struct list_head sdcardfs_super_list; /* for packagelist.c */ extern appid_t get_appid(const char *app_name); +extern appid_t is_excluded(const char *app_name, userid_t userid); extern int check_caller_access_to_name(struct inode *parent_node, const char* name); extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void); /* for derived_perm.c */ +#define BY_NAME (1 << 0) +#define BY_USERID (1 << 1) +struct limit_search { + unsigned int flags; + const char *name; + size_t length; + userid_t userid; +}; + extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry); extern void fixup_top_recursive(struct dentry *parent); -extern void fixup_perms_recursive(struct dentry *dentry, const char *name, size_t len); +extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); extern void update_derived_permission_lock(struct dentry *dentry); extern int need_graft_path(struct dentry *dentry); From d46684aa425d950ed86d4e3c10367b2a5c0b355d Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 26 Jan 2017 20:10:34 -0800 Subject: [PATCH 0638/3220] ANDROID: sdcardfs: Remove redundant operation We call get_derived_permission_new unconditionally, so we don't need to call 
update_derived_permission_lock, which does the same thing.

Change-Id: I0748100828c6af806da807241a33bf42be614935
Signed-off-by: Daniel Rosenberg
---
 fs/sdcardfs/inode.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c
index 6f450e523675..a2cad0f76f14 100644
--- a/fs/sdcardfs/inode.c
+++ b/fs/sdcardfs/inode.c
@@ -466,7 +466,6 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry = NULL;
 	struct vfsmount *lower_mnt = NULL;
 	struct dentry *trap = NULL;
-	struct dentry *new_parent = NULL;
 	struct path lower_old_path, lower_new_path;
 	const struct cred *saved_cred = NULL;

@@ -516,17 +515,6 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_dir != old_dir) {
 		sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry));
 		fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry));
-
-		/* update the derived permission of the old_dentry
-		 * with its new parent
-		 */
-		new_parent = dget_parent(new_dentry);
-		if(new_parent) {
-			if(d_inode(old_dentry)) {
-				update_derived_permission_lock(old_dentry);
-			}
-			dput(new_parent);
-		}
 	}
 	/* At this point, not all dentry information has been moved, so
 	 * we pass along new_dentry for the name.*/

From 461da83e67affea70e2f70a0231606ad3f29f5b7 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Wed, 25 Jan 2017 13:48:45 -0800
Subject: [PATCH 0639/3220] ANDROID: sdcardfs: Add GID Derivation to sdcardfs

This changes sdcardfs to modify the user and group in the underlying
filesystem depending on its usage. Ownership is set by Android user,
and package, as well as if the file is under obb or cache. Other files
can be labeled by extension. Those values are set via the configfs
interface.
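For concreteness, a worked example using the constants this patch adds
to multiuser.h (the user and appid values are illustrative): for user
10 and a package mapped to appid 10034,

    uid       = 10 * 100000 + 10034                   = 1010034
    cache gid = 10 * 100000 + 20000 + (10034 - 10000) = 1020034
    ext gid   = 10 * 100000 + 30000 + (10034 - 10000) = 1030034

per multiuser_get_uid(), multiuser_get_cache_gid() and
multiuser_get_ext_gid() below.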
To add an entry, mkdir -p [configfs root]/sdcardfs/extensions/[gid]/[ext] Bug: 34262585 Change-Id: I4e030ce84f094a678376349b1a96923e5076a0f4 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 167 +++++++++++++++++++++++++-- fs/sdcardfs/file.c | 2 +- fs/sdcardfs/inode.c | 34 +++--- fs/sdcardfs/lookup.c | 3 +- fs/sdcardfs/multiuser.h | 28 +++-- fs/sdcardfs/packagelist.c | 228 ++++++++++++++++++++++++++++++++++--- fs/sdcardfs/sdcardfs.h | 25 +++- 7 files changed, 432 insertions(+), 55 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 8e3baee4a2d9..d2bff5ecdad0 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -30,6 +30,8 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) ci->userid = pi->userid; ci->d_uid = pi->d_uid; ci->under_android = pi->under_android; + ci->under_cache = pi->under_cache; + ci->under_obb = pi->under_obb; set_top(ci, pi->top); } @@ -43,11 +45,13 @@ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, info->userid = userid; info->d_uid = uid; info->under_android = under_android; + info->under_cache = false; + info->under_obb = false; set_top(info, top); } /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ -void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry) +void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const char *name) { struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); @@ -57,26 +61,30 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st * the properties are maintained on its private fields * because the inode attributes will be modified with that of * its lower inode. - * The derived state will be updated on the last - * stage of each system call by fix_derived_permission(inode). + * These values are used by our custom permission call instead + * of using the inode permissions. */ inherit_derived_state(d_inode(parent), d_inode(dentry)); + /* Files don't get special labels */ + if (!S_ISDIR(d_inode(dentry)->i_mode)) + return; /* Derive custom permissions based on parent and current node */ switch (parent_info->perm) { case PERM_INHERIT: + case PERM_ANDROID_PACKAGE_CACHE: /* Already inherited above */ break; case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; - info->userid = simple_strtoul(newdentry->d_name.name, NULL, 10); + info->userid = simple_strtoul(name, NULL, 10); set_top(info, &info->vfs_inode); break; case PERM_ROOT: /* Assume masked off by default. 
*/ - if (!strcasecmp(newdentry->d_name.name, "Android")) { + if (!strcasecmp(name, "Android")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID; info->under_android = true; @@ -84,36 +92,152 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, st } break; case PERM_ANDROID: - if (!strcasecmp(newdentry->d_name.name, "data")) { + if (!strcasecmp(name, "data")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_DATA; set_top(info, &info->vfs_inode); - } else if (!strcasecmp(newdentry->d_name.name, "obb")) { + } else if (!strcasecmp(name, "obb")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_OBB; + info->under_obb = true; set_top(info, &info->vfs_inode); /* Single OBB directory is always shared */ - } else if (!strcasecmp(newdentry->d_name.name, "media")) { + } else if (!strcasecmp(name, "media")) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_MEDIA; set_top(info, &info->vfs_inode); } break; - case PERM_ANDROID_DATA: case PERM_ANDROID_OBB: + case PERM_ANDROID_DATA: case PERM_ANDROID_MEDIA: - appid = get_appid(newdentry->d_name.name); - if (appid != 0 && !is_excluded(newdentry->d_name.name, parent_info->userid)) { + info->perm = PERM_ANDROID_PACKAGE; + appid = get_appid(name); + if (appid != 0 && !is_excluded(name, parent_info->userid)) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } set_top(info, &info->vfs_inode); break; + case PERM_ANDROID_PACKAGE: + if (!strcasecmp(name, "cache")) { + info->perm = PERM_ANDROID_PACKAGE_CACHE; + info->under_cache = true; + } + break; } } void get_derived_permission(struct dentry *parent, struct dentry *dentry) { - get_derived_permission_new(parent, dentry, dentry); + get_derived_permission_new(parent, dentry, dentry->d_name.name); +} + +static appid_t get_type(const char *name) { + const char *ext = strrchr(name, '.'); + appid_t id; + + if (ext && ext[0]) { + ext = &ext[1]; + id = get_ext_gid(ext); + return id?:AID_MEDIA_RW; + } + return AID_MEDIA_RW; +} + +void fixup_lower_ownership(struct dentry* dentry, const char *name) { + struct path path; + struct inode *inode; + struct inode *delegated_inode = NULL; + int error; + struct sdcardfs_inode_info *info; + struct sdcardfs_inode_info *info_top; + perm_t perm; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + uid_t uid = sbi->options.fs_low_uid; + gid_t gid = sbi->options.fs_low_gid; + struct iattr newattrs; + + info = SDCARDFS_I(d_inode(dentry)); + perm = info->perm; + if (info->under_obb) { + perm = PERM_ANDROID_OBB; + } else if (info->under_cache) { + perm = PERM_ANDROID_PACKAGE_CACHE; + } else if (perm == PERM_INHERIT) { + info_top = SDCARDFS_I(grab_top(info)); + perm = info_top->perm; + release_top(info); + } + + switch (perm) { + case PERM_ROOT: + case PERM_ANDROID: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + case PERM_ANDROID_PACKAGE: + case PERM_ANDROID_PACKAGE_CACHE: + uid = multiuser_get_uid(info->userid, uid); + break; + case PERM_ANDROID_OBB: + uid = AID_MEDIA_OBB; + break; + case PERM_PRE_ROOT: + default: + break; + } + switch (perm) { + case PERM_ROOT: + case PERM_ANDROID: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + if (S_ISDIR(d_inode(dentry)->i_mode)) + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + else + gid = multiuser_get_uid(info->userid, get_type(name)); + break; + case PERM_ANDROID_OBB: + gid = AID_MEDIA_OBB; + break; + case PERM_ANDROID_PACKAGE: + 
if (info->d_uid != 0) + gid = multiuser_get_ext_gid(info->userid, info->d_uid); + else + gid = multiuser_get_uid(info->userid, uid); + break; + case PERM_ANDROID_PACKAGE_CACHE: + if (info->d_uid != 0) + gid = multiuser_get_cache_gid(info->userid, info->d_uid); + else + gid = multiuser_get_uid(info->userid, uid); + break; + case PERM_PRE_ROOT: + default: + break; + } + + sdcardfs_get_lower_path(dentry, &path); + inode = d_inode(path.dentry); + if (d_inode(path.dentry)->i_gid.val != gid || d_inode(path.dentry)->i_uid.val != uid) { +retry_deleg: + newattrs.ia_valid = ATTR_GID | ATTR_UID | ATTR_FORCE; + newattrs.ia_uid = make_kuid(current_user_ns(), uid); + newattrs.ia_gid = make_kgid(current_user_ns(), gid); + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= + ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + mutex_lock(&inode->i_mutex); + error = security_path_chown(&path, newattrs.ia_uid, newattrs.ia_gid); + if (!error) + error = notify_change2(path.mnt, path.dentry, &newattrs, &delegated_inode); + mutex_unlock(&inode->i_mutex); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + if (error) + pr_err("sdcardfs: Failed to touch up lower fs gid/uid.\n"); + } } static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) { @@ -167,9 +291,28 @@ void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) { dput(dentry); } +void drop_recursive(struct dentry *parent) { + struct dentry *dentry; + struct sdcardfs_inode_info *info; + if (!d_inode(parent)) + return; + info = SDCARDFS_I(d_inode(parent)); + spin_lock(&parent->d_lock); + list_for_each_entry(dentry, &parent->d_subdirs, d_child) { + if (d_inode(dentry)) { + if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { + drop_recursive(dentry); + d_drop(dentry); + } + } + } + spin_unlock(&parent->d_lock); +} + void fixup_top_recursive(struct dentry *parent) { struct dentry *dentry; struct sdcardfs_inode_info *info; + if (!d_inode(parent)) return; info = SDCARDFS_I(d_inode(parent)); diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 7750a0472389..006c6ff57ad7 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -225,7 +225,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) } /* save current_cred and override it */ - OVERRIDE_CRED(sbi, saved_cred); + OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(inode)); file->private_data = kzalloc(sizeof(struct sdcardfs_file_info), GFP_KERNEL); diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index a2cad0f76f14..cb0588691a0f 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -22,16 +22,21 @@ #include /* Do not directly use this function. Use OVERRIDE_CRED() instead. 
*/ -const struct cred * override_fsids(struct sdcardfs_sb_info* sbi) +const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info) { struct cred * cred; const struct cred * old_cred; + uid_t uid; cred = prepare_creds(); if (!cred) return NULL; - cred->fsuid = make_kuid(&init_user_ns, sbi->options.fs_low_uid); + if (info->under_obb) + uid = AID_MEDIA_OBB; + else + uid = multiuser_get_uid(info->userid, sbi->options.fs_low_uid); + cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid); old_cred = override_creds(cred); @@ -70,7 +75,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -98,6 +103,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); + fixup_lower_ownership(dentry, dentry->d_name.name); out: current->fs = saved_fs; @@ -171,7 +177,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -279,7 +285,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); /* check disk space */ if (!check_min_free_space(dentry, 0, 1)) { @@ -343,9 +349,8 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry)); /* update number of links on parent directory */ set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink); - + fixup_lower_ownership(dentry, dentry->d_name.name); unlock_dir(lower_parent_dentry); - if ((!sbi->options.multiuser) && (!strcasecmp(dentry->d_name.name, "obb")) && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) make_nomedia_in_obb = 1; @@ -353,6 +358,8 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* When creating /Android/data and /Android/obb, mark them as .nomedia */ if (make_nomedia_in_obb || ((pi->perm == PERM_ANDROID) && (!strcasecmp(dentry->d_name.name, "data")))) { + REVERT_CRED(saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(d_inode(dentry))); set_fs_pwd(current->fs, &lower_path); touch_err = touch(".nomedia", 0664); if (touch_err) { @@ -390,7 +397,7 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry * the dentry on the original path should be deleted. 
*/ @@ -479,7 +486,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, } /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(old_dir->i_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(old_dir->i_sb), saved_cred, SDCARDFS_I(new_dir)); sdcardfs_get_real_lower(old_dentry, &lower_old_path); sdcardfs_get_lower_path(new_dentry, &lower_new_path); @@ -516,11 +523,10 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry)); fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry)); } - /* At this point, not all dentry information has been moved, so - * we pass along new_dentry for the name.*/ - get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry); + get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry->d_name.name); fixup_tmp_permissions(d_inode(old_dentry)); - fixup_top_recursive(old_dentry); + fixup_lower_ownership(old_dentry, new_dentry->d_name.name); + drop_recursive(old_dentry); /* Can't fixup ownership recursively :( */ out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); @@ -750,7 +756,7 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct goto out_err; /* save current_cred and override it */ - OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred); + OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred, SDCARDFS_I(inode)); sdcardfs_get_lower_path(dentry, &lower_path); lower_dentry = lower_path.dentry; diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index e94a65c8bbbd..3c9454e5e1c6 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -368,7 +368,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, } /* save current_cred and override it */ - OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred); + OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); sdcardfs_get_lower_path(parent, &lower_parent_path); @@ -392,6 +392,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, /* get derived permission */ get_derived_permission(parent, dentry); fixup_tmp_permissions(d_inode(dentry)); + fixup_lower_ownership(dentry, dentry->d_name.name); } /* update parent directory's atime */ fsstack_copy_attr_atime(d_inode(parent), diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index 923ba101dfa9..52bc20080904 100644 --- a/fs/sdcardfs/multiuser.h +++ b/fs/sdcardfs/multiuser.h @@ -18,20 +18,32 @@ * General Public License. 
*/ -#define MULTIUSER_APP_PER_USER_RANGE 100000 +#define AID_USER_OFFSET 100000 /* offset for uid ranges for each user */ +#define AID_APP_START 10000 /* first app user */ +#define AID_APP_END 19999 /* last app user */ +#define AID_CACHE_GID_START 20000 /* start of gids for apps to mark cached data */ +#define AID_EXT_GID_START 30000 /* start of gids for apps to mark external data */ +#define AID_SHARED_GID_START 50000 /* start of gids for apps in each user to share */ typedef uid_t userid_t; typedef uid_t appid_t; -static inline userid_t multiuser_get_user_id(uid_t uid) { - return uid / MULTIUSER_APP_PER_USER_RANGE; +static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) { + return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } -static inline appid_t multiuser_get_app_id(uid_t uid) { - return uid % MULTIUSER_APP_PER_USER_RANGE; +static inline gid_t multiuser_get_cache_gid(userid_t user_id, appid_t app_id) { + if (app_id >= AID_APP_START && app_id <= AID_APP_END) { + return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_CACHE_GID_START); + } else { + return -1; + } } -static inline uid_t multiuser_get_uid(userid_t userId, appid_t appId) { - return userId * MULTIUSER_APP_PER_USER_RANGE + (appId % MULTIUSER_APP_PER_USER_RANGE); +static inline gid_t multiuser_get_ext_gid(userid_t user_id, appid_t app_id) { + if (app_id >= AID_APP_START && app_id <= AID_APP_END) { + return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_EXT_GID_START); + } else { + return -1; + } } - diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 6eb73ddc2ceb..cdab1967317b 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -21,6 +21,7 @@ #include "sdcardfs.h" #include #include +#include #include @@ -38,6 +39,8 @@ struct hashtable_entry { static DEFINE_HASHTABLE(package_to_appid, 8); static DEFINE_HASHTABLE(package_to_userid, 8); +static DEFINE_HASHTABLE(ext_to_groupid, 8); + static struct kmem_cache *hashtable_entry_cachep; @@ -53,15 +56,33 @@ static unsigned int str_hash(const char *key) { return h; } -appid_t get_appid(const char *app_name) +appid_t get_appid(const char *key) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(app_name); + unsigned int hash = str_hash(key); appid_t ret_id; rcu_read_lock(); hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(app_name, hash_cur->key)) { + if (!strcasecmp(key, hash_cur->key)) { + ret_id = atomic_read(&hash_cur->value); + rcu_read_unlock(); + return ret_id; + } + } + rcu_read_unlock(); + return 0; +} + +appid_t get_ext_gid(const char *key) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(key); + appid_t ret_id; + + rcu_read_lock(); + hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key)) { ret_id = atomic_read(&hash_cur->value); rcu_read_unlock(); return ret_id; @@ -124,7 +145,7 @@ int open_flags_to_access_mode(int open_flags) { } } -static struct hashtable_entry *alloc_packagelist_entry(const char *key, +static struct hashtable_entry *alloc_hashtable_entry(const char *key, appid_t value) { struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep, @@ -154,13 +175,31 @@ static int insert_packagelist_appid_entry_locked(const char *key, appid_t value) return 0; } } - new_entry = alloc_packagelist_entry(key, value); + new_entry = alloc_hashtable_entry(key, value); if (!new_entry) return -ENOMEM; hash_add_rcu(package_to_appid, &new_entry->hlist, 
hash); return 0; } +static int insert_ext_gid_entry_locked(const char *key, appid_t value) +{ + struct hashtable_entry *hash_cur; + struct hashtable_entry *new_entry; + unsigned int hash = str_hash(key); + + /* An extension can only belong to one gid */ + hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key)) + return -EINVAL; + } + new_entry = alloc_hashtable_entry(key, value); + if (!new_entry) + return -ENOMEM; + hash_add_rcu(ext_to_groupid, &new_entry->hlist, hash); + return 0; +} + static int insert_userid_exclude_entry_locked(const char *key, userid_t value) { struct hashtable_entry *hash_cur; @@ -172,7 +211,7 @@ static int insert_userid_exclude_entry_locked(const char *key, userid_t value) if (atomic_read(&hash_cur->value) == value && !strcasecmp(key, hash_cur->key)) return 0; } - new_entry = alloc_packagelist_entry(key, value); + new_entry = alloc_hashtable_entry(key, value); if (!new_entry) return -ENOMEM; hash_add_rcu(package_to_userid, &new_entry->hlist, hash); @@ -234,6 +273,17 @@ static int insert_packagelist_entry(const char *key, appid_t value) return err; } +static int insert_ext_gid_entry(const char *key, appid_t value) +{ + int err; + + mutex_lock(&sdcardfs_super_list_lock); + err = insert_ext_gid_entry_locked(key, value); + mutex_unlock(&sdcardfs_super_list_lock); + + return err; +} + static int insert_userid_exclude_entry(const char *key, userid_t value) { int err; @@ -247,7 +297,7 @@ static int insert_userid_exclude_entry(const char *key, userid_t value) return err; } -static void free_packagelist_entry(struct hashtable_entry *entry) +static void free_hashtable_entry(struct hashtable_entry *entry) { kfree(entry->key); hash_del_rcu(&entry->dlist); @@ -276,7 +326,7 @@ static void remove_packagelist_entry_locked(const char *key) } synchronize_rcu(); hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) - free_packagelist_entry(hash_cur); + free_hashtable_entry(hash_cur); } static void remove_packagelist_entry(const char *key) @@ -288,6 +338,29 @@ static void remove_packagelist_entry(const char *key) return; } +static void remove_ext_gid_entry_locked(const char *key, gid_t group) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = str_hash(key); + + hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { + if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == group) { + hash_del_rcu(&hash_cur->hlist); + synchronize_rcu(); + free_hashtable_entry(hash_cur); + break; + } + } +} + +static void remove_ext_gid_entry(const char *key, gid_t group) +{ + mutex_lock(&sdcardfs_super_list_lock); + remove_ext_gid_entry_locked(key, group); + mutex_unlock(&sdcardfs_super_list_lock); + return; +} + static void remove_userid_all_entry_locked(userid_t userid) { struct hashtable_entry *hash_cur; @@ -303,7 +376,7 @@ static void remove_userid_all_entry_locked(userid_t userid) } synchronize_rcu(); hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) { - free_packagelist_entry(hash_cur); + free_hashtable_entry(hash_cur); } } @@ -325,7 +398,7 @@ static void remove_userid_exclude_entry_locked(const char *key, userid_t userid) if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == userid) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); - free_packagelist_entry(hash_cur); + free_hashtable_entry(hash_cur); break; } } @@ -357,7 +430,7 @@ static void packagelist_destroy(void) } synchronize_rcu(); hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) - 
free_packagelist_entry(hash_cur); + free_hashtable_entry(hash_cur); mutex_unlock(&sdcardfs_super_list_lock); printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); } @@ -502,6 +575,127 @@ static struct config_item_type package_appid_type = { .ct_owner = THIS_MODULE, }; +struct extensions_value { + struct config_group group; + unsigned int num; +}; + +struct extension_details { + struct config_item item; + const char* name; + unsigned int num; +}; + +static inline struct extensions_value *to_extensions_value(struct config_item *item) +{ + return item ? container_of(to_config_group(item), struct extensions_value, group) : NULL; +} + +static inline struct extension_details *to_extension_details(struct config_item *item) +{ + return item ? container_of(item, struct extension_details, item) : NULL; +} + +static void extension_details_release(struct config_item *item) +{ + struct extension_details *extension_details = to_extension_details(item); + + printk(KERN_INFO "sdcardfs: No longer mapping %s files to gid %d\n", + extension_details->name, extension_details->num); + remove_ext_gid_entry(extension_details->name, extension_details->num); + kfree(extension_details->name); + kfree(extension_details); +} + +static struct configfs_item_operations extension_details_item_ops = { + .release = extension_details_release, +}; + +static struct config_item_type extension_details_type = { + .ct_item_ops = &extension_details_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item *extension_details_make_item(struct config_group *group, const char *name) +{ + struct extensions_value *extensions_value = to_extensions_value(&group->cg_item); + struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL); + int ret; + if (!extension_details) + return ERR_PTR(-ENOMEM); + + extension_details->name = kstrdup(name, GFP_KERNEL); + if (!extension_details->name) { + kfree(extension_details); + return ERR_PTR(-ENOMEM); + } + extension_details->num = extensions_value->num; + ret = insert_ext_gid_entry(name, extensions_value->num); + + if (ret) { + kfree(extension_details->name); + kfree(extension_details); + return ERR_PTR(ret); + } + config_item_init_type_name(&extension_details->item, name, &extension_details_type); + + return &extension_details->item; +} + +static struct configfs_group_operations extensions_value_group_ops = { + .make_item = extension_details_make_item, +}; + +static struct config_item_type extensions_name_type = { + .ct_group_ops = &extensions_value_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *extensions_make_group(struct config_group *group, const char *name) +{ + struct extensions_value *extensions_value; + unsigned int tmp; + int ret; + + extensions_value = kzalloc(sizeof(struct extensions_value), GFP_KERNEL); + if (!extensions_value) + return ERR_PTR(-ENOMEM); + ret = kstrtouint(name, 10, &tmp); + if (ret) { + kfree(extensions_value); + return ERR_PTR(ret); + } + + extensions_value->num = tmp; + config_group_init_type_name(&extensions_value->group, name, + &extensions_name_type); + return &extensions_value->group; +} + +static void extensions_drop_group(struct config_group *group, struct config_item *item) +{ + struct extensions_value *value = to_extensions_value(item); + printk(KERN_INFO "sdcardfs: No longer mapping any files to gid %d\n", value->num); + kfree(value); +} + +static struct configfs_group_operations extensions_group_ops = { + .make_group = extensions_make_group, + .drop_item = 
extensions_drop_group, +}; + +static struct config_item_type extensions_type = { + .ct_group_ops = &extensions_group_ops, + .ct_owner = THIS_MODULE, +}; + +struct config_group extension_group = { + .cg_item = { + .ci_namebuf = "extensions", + .ci_type = &extensions_type, + }, +}; + static struct config_item *packages_make_item(struct config_group *group, const char *name) { struct package_details *package_details; @@ -595,20 +789,28 @@ static struct config_item_type packages_type = { .ct_owner = THIS_MODULE, }; +struct config_group *sd_default_groups[] = { + &extension_group, + NULL, +}; + static struct configfs_subsystem sdcardfs_packages = { .su_group = { .cg_item = { .ci_namebuf = "sdcardfs", .ci_type = &packages_type, }, + .default_groups = sd_default_groups, }, }; static int configfs_sdcardfs_init(void) { - int ret; + int ret, i; struct configfs_subsystem *subsys = &sdcardfs_packages; - + for (i = 0; sd_default_groups[i]; i++) { + config_group_init(sd_default_groups[i]); + } config_group_init(&subsys->su_group); mutex_init(&subsys->su_mutex); ret = configfs_register_subsystem(subsys); diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 3434849cee63..03da961e3b09 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -65,6 +65,9 @@ #define AID_SDCARD_PICS 1033 /* external storage photos access */ #define AID_SDCARD_AV 1034 /* external storage audio/video access */ #define AID_SDCARD_ALL 1035 /* access all users external storage */ +#define AID_MEDIA_OBB 1059 /* obb files */ + +#define AID_SDCARD_IMAGE 1057 #define AID_PACKAGE_INFO 1027 @@ -91,12 +94,12 @@ * These two macro should be used in pair, and OVERRIDE_CRED() should be * placed at the beginning of a function, right after variable declaration. */ -#define OVERRIDE_CRED(sdcardfs_sbi, saved_cred) \ - saved_cred = override_fsids(sdcardfs_sbi); \ +#define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info) \ + saved_cred = override_fsids(sdcardfs_sbi, info); \ if (!saved_cred) { return -ENOMEM; } -#define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred) \ - saved_cred = override_fsids(sdcardfs_sbi); \ +#define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info) \ + saved_cred = override_fsids(sdcardfs_sbi, info); \ if (!saved_cred) { return ERR_PTR(-ENOMEM); } #define REVERT_CRED(saved_cred) revert_fsids(saved_cred) @@ -127,13 +130,18 @@ typedef enum { PERM_ANDROID_OBB, /* This node is "/Android/media" */ PERM_ANDROID_MEDIA, + /* This node is "/Android/[data|media|obb]/[package]" */ + PERM_ANDROID_PACKAGE, + /* This node is "/Android/[data|media|obb]/[package]/cache" */ + PERM_ANDROID_PACKAGE_CACHE, } perm_t; struct sdcardfs_sb_info; struct sdcardfs_mount_options; +struct sdcardfs_inode_info; /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ -const struct cred * override_fsids(struct sdcardfs_sb_info* sbi); +const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info); /* Do not directly use this function, use REVERT_CRED() instead. 
*/ void revert_fsids(const struct cred * old_cred); @@ -175,6 +183,8 @@ struct sdcardfs_inode_info { userid_t userid; uid_t d_uid; bool under_android; + bool under_cache; + bool under_obb; /* top folder for ownership */ struct inode *top; @@ -447,6 +457,7 @@ extern struct list_head sdcardfs_super_list; /* for packagelist.c */ extern appid_t get_appid(const char *app_name); +extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); extern int check_caller_access_to_name(struct inode *parent_node, const char* name); extern int open_flags_to_access_mode(int open_flags); @@ -466,11 +477,13 @@ struct limit_search { extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); -extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct dentry *newdentry); +extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const char *name); +extern void drop_recursive(struct dentry *parent); extern void fixup_top_recursive(struct dentry *parent); extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); extern void update_derived_permission_lock(struct dentry *dentry); +void fixup_lower_ownership(struct dentry* dentry, const char *name); extern int need_graft_path(struct dentry *dentry); extern int is_base_obbpath(struct dentry *dentry); extern int is_obbpath_invalid(struct dentry *dentry); From 7191add87cd90ac136df580204b008b33a913347 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 27 Jan 2017 19:35:08 -0800 Subject: [PATCH 0640/3220] ANDROID: sdcardfs: switch to full_name_hash and qstr Use the kernel's string hash function instead of rolling our own. Additionally, save a bit of calculation by using the qstr struct in place of strings. 
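A minimal sketch of the idiom this moves to (illustrative only; it
mirrors the qstr_init() helper the patch introduces):

    struct qstr q;

    q.name = name;
    q.len = strlen(q.name);
    q.hash = full_name_hash(q.name, q.len);  /* hashed once at setup */

    /* Inserts store the whole qstr; lookups bucket on q.hash and only
     * fall back to a case-insensitive string compare within the
     * bucket. */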
Change-Id: I0bbeb5ec2a9233f40135ad632e6f22c30ffa95c1 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/packagelist.c | 215 +++++++++++++++++++++----------------- 1 file changed, 121 insertions(+), 94 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index cdab1967317b..b02feef08d51 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -22,7 +22,7 @@ #include #include #include - +#include #include #include @@ -33,7 +33,7 @@ struct hashtable_entry { struct hlist_node hlist; struct hlist_node dlist; /* for deletion cleanup */ - const char *key; + struct qstr key; atomic_t value; }; @@ -44,27 +44,53 @@ static DEFINE_HASHTABLE(ext_to_groupid, 8); static struct kmem_cache *hashtable_entry_cachep; -static unsigned int str_hash(const char *key) { - int i; - unsigned int h = strlen(key); - char *data = (char *)key; - - for (i = 0; i < strlen(key); i++) { - h = h * 31 + *data; - data++; - } - return h; +static void inline qstr_init(struct qstr *q, const char *name) { + q->name = name; + q->len = strlen(q->name); + q->hash = full_name_hash(q->name, q->len); } -appid_t get_appid(const char *key) +static inline int qstr_copy(const struct qstr *src, struct qstr *dest) { + dest->name = kstrdup(src->name, GFP_KERNEL); + dest->hash_len = src->hash_len; + return !!dest->name; +} + + +static appid_t __get_appid(const struct qstr *key) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; appid_t ret_id; rcu_read_lock(); hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { + if (!strcasecmp(key->name, hash_cur->key.name)) { + ret_id = atomic_read(&hash_cur->value); + rcu_read_unlock(); + return ret_id; + } + } + rcu_read_unlock(); + return 0; +} + +appid_t get_appid(const char *key) +{ + struct qstr q; + qstr_init(&q, key); + return __get_appid(&q); +} + +static appid_t __get_ext_gid(const struct qstr *key) +{ + struct hashtable_entry *hash_cur; + unsigned int hash = key->hash; + appid_t ret_id; + + rcu_read_lock(); + hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { + if (!strcasecmp(key->name, hash_cur->key.name)) { ret_id = atomic_read(&hash_cur->value); rcu_read_unlock(); return ret_id; @@ -75,17 +101,23 @@ appid_t get_appid(const char *key) } appid_t get_ext_gid(const char *key) +{ + struct qstr q; + qstr_init(&q, key); + return __get_ext_gid(&q); +} + +static appid_t __is_excluded(const struct qstr *app_name, userid_t user) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); - appid_t ret_id; + unsigned int hash = app_name->hash; rcu_read_lock(); - hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { - ret_id = atomic_read(&hash_cur->value); + hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { + if (atomic_read(&hash_cur->value) == user && + !strcasecmp(app_name->name, hash_cur->key.name)) { rcu_read_unlock(); - return ret_id; + return 1; } } rcu_read_unlock(); @@ -94,20 +126,12 @@ appid_t get_ext_gid(const char *key) appid_t is_excluded(const char *app_name, userid_t user) { - struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(app_name); - - rcu_read_lock(); - hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (atomic_read(&hash_cur->value) == user && !strcasecmp(app_name, hash_cur->key)) { - rcu_read_unlock(); - return 1; - } - } - rcu_read_unlock(); - return 0; + struct qstr q; + qstr_init(&q, 
app_name); + return __is_excluded(&q, user); } + /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access * even further, such as enforcing that apps hold sdcard_rw. */ @@ -145,7 +169,7 @@ int open_flags_to_access_mode(int open_flags) { } } -static struct hashtable_entry *alloc_hashtable_entry(const char *key, +static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, appid_t value) { struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep, @@ -153,8 +177,7 @@ static struct hashtable_entry *alloc_hashtable_entry(const char *key, if (!ret) return NULL; - ret->key = kstrdup(key, GFP_KERNEL); - if (!ret->key) { + if (!qstr_copy(key, &ret->key)) { kmem_cache_free(hashtable_entry_cachep, ret); return NULL; } @@ -163,14 +186,14 @@ static struct hashtable_entry *alloc_hashtable_entry(const char *key, return ret; } -static int insert_packagelist_appid_entry_locked(const char *key, appid_t value) +static int insert_packagelist_appid_entry_locked(const struct qstr *key, appid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { + if (!strcasecmp(key->name, hash_cur->key.name)) { atomic_set(&hash_cur->value, value); return 0; } @@ -182,15 +205,15 @@ static int insert_packagelist_appid_entry_locked(const char *key, appid_t value) return 0; } -static int insert_ext_gid_entry_locked(const char *key, appid_t value) +static int insert_ext_gid_entry_locked(const struct qstr *key, appid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; /* An extension can only belong to one gid */ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) + if (!strcasecmp(key->name, hash_cur->key.name)) return -EINVAL; } new_entry = alloc_hashtable_entry(key, value); @@ -200,15 +223,16 @@ static int insert_ext_gid_entry_locked(const char *key, appid_t value) return 0; } -static int insert_userid_exclude_entry_locked(const char *key, userid_t value) +static int insert_userid_exclude_entry_locked(const struct qstr *key, userid_t value) { struct hashtable_entry *hash_cur; struct hashtable_entry *new_entry; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; /* Only insert if not already present */ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (atomic_read(&hash_cur->value) == value && !strcasecmp(key, hash_cur->key)) + if (atomic_read(&hash_cur->value) == value && + !strcasecmp(key->name, hash_cur->key.name)) return 0; } new_entry = alloc_hashtable_entry(key, value); @@ -218,13 +242,13 @@ static int insert_userid_exclude_entry_locked(const char *key, userid_t value) return 0; } -static void fixup_all_perms_name(const char *key) +static void fixup_all_perms_name(const struct qstr *key) { struct sdcardfs_sb_info *sbinfo; struct limit_search limit = { .flags = BY_NAME, - .name = key, - .length = strlen(key), + .name = key->name, + .length = key->len, }; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { if (sbinfo_has_sdcard_magic(sbinfo)) @@ -232,13 +256,13 @@ static void fixup_all_perms_name(const char *key) } } -static void fixup_all_perms_name_userid(const char *key, userid_t userid) +static void 
fixup_all_perms_name_userid(const struct qstr *key, userid_t userid) { struct sdcardfs_sb_info *sbinfo; struct limit_search limit = { .flags = BY_NAME | BY_USERID, - .name = key, - .length = strlen(key), + .name = key->name, + .length = key->len, .userid = userid, }; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { @@ -260,7 +284,7 @@ static void fixup_all_perms_userid(userid_t userid) } } -static int insert_packagelist_entry(const char *key, appid_t value) +static int insert_packagelist_entry(const struct qstr *key, appid_t value) { int err; @@ -273,7 +297,7 @@ static int insert_packagelist_entry(const char *key, appid_t value) return err; } -static int insert_ext_gid_entry(const char *key, appid_t value) +static int insert_ext_gid_entry(const struct qstr *key, appid_t value) { int err; @@ -284,7 +308,7 @@ static int insert_ext_gid_entry(const char *key, appid_t value) return err; } -static int insert_userid_exclude_entry(const char *key, userid_t value) +static int insert_userid_exclude_entry(const struct qstr *key, userid_t value) { int err; @@ -299,26 +323,26 @@ static int insert_userid_exclude_entry(const char *key, userid_t value) static void free_hashtable_entry(struct hashtable_entry *entry) { - kfree(entry->key); + kfree(entry->key.name); hash_del_rcu(&entry->dlist); kmem_cache_free(hashtable_entry_cachep, entry); } -static void remove_packagelist_entry_locked(const char *key) +static void remove_packagelist_entry_locked(const struct qstr *key) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; struct hlist_node *h_t; HLIST_HEAD(free_list); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { + if (!strcasecmp(key->name, hash_cur->key.name)) { hash_del_rcu(&hash_cur->hlist); hlist_add_head(&hash_cur->dlist, &free_list); } } hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key)) { + if (!strcasecmp(key->name, hash_cur->key.name)) { hash_del_rcu(&hash_cur->hlist); hlist_add_head(&hash_cur->dlist, &free_list); break; @@ -329,7 +353,7 @@ static void remove_packagelist_entry_locked(const char *key) free_hashtable_entry(hash_cur); } -static void remove_packagelist_entry(const char *key) +static void remove_packagelist_entry(const struct qstr *key) { mutex_lock(&sdcardfs_super_list_lock); remove_packagelist_entry_locked(key); @@ -338,13 +362,13 @@ static void remove_packagelist_entry(const char *key) return; } -static void remove_ext_gid_entry_locked(const char *key, gid_t group) +static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == group) { + if (!strcasecmp(key->name, hash_cur->key.name) && atomic_read(&hash_cur->value) == group) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); free_hashtable_entry(hash_cur); @@ -353,7 +377,7 @@ static void remove_ext_gid_entry_locked(const char *key, gid_t group) } } -static void remove_ext_gid_entry(const char *key, gid_t group) +static void remove_ext_gid_entry(const struct qstr *key, gid_t group) { mutex_lock(&sdcardfs_super_list_lock); remove_ext_gid_entry_locked(key, group); @@ -389,13 +413,14 @@ static void remove_userid_all_entry(userid_t userid) return; } -static void 
remove_userid_exclude_entry_locked(const char *key, userid_t userid) +static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t userid) { struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(key); + unsigned int hash = key->hash; hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(key, hash_cur->key) && atomic_read(&hash_cur->value) == userid) { + if (!strcasecmp(key->name, hash_cur->key.name) && + atomic_read(&hash_cur->value) == userid) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); free_hashtable_entry(hash_cur); @@ -404,7 +429,7 @@ static void remove_userid_exclude_entry_locked(const char *key, userid_t userid) } } -static void remove_userid_exclude_entry(const char *key, userid_t userid) +static void remove_userid_exclude_entry(const struct qstr *key, userid_t userid) { mutex_lock(&sdcardfs_super_list_lock); remove_userid_exclude_entry_locked(key, userid); @@ -462,7 +487,7 @@ static struct configfs_attribute _pfx##attr_##_name = { \ struct package_details { struct config_item item; - const char *name; + struct qstr name; }; static inline struct package_details *to_package_details(struct config_item *item) @@ -472,7 +497,7 @@ static inline struct package_details *to_package_details(struct config_item *ite static ssize_t package_details_appid_show(struct config_item *item, char *page) { - return scnprintf(page, PAGE_SIZE, "%u\n", get_appid(to_package_details(item)->name)); + return scnprintf(page, PAGE_SIZE, "%u\n", __get_appid(&to_package_details(item)->name)); } static ssize_t package_details_appid_store(struct config_item *item, @@ -485,7 +510,7 @@ static ssize_t package_details_appid_store(struct config_item *item, if (ret) return ret; - ret = insert_packagelist_entry(to_package_details(item)->name, tmp); + ret = insert_packagelist_entry(&to_package_details(item)->name, tmp); if (ret) return ret; @@ -498,12 +523,12 @@ static ssize_t package_details_excluded_userids_show(struct config_item *item, { struct package_details *package_details = to_package_details(item); struct hashtable_entry *hash_cur; - unsigned int hash = str_hash(package_details->name); + unsigned int hash = package_details->name.hash; int count = 0; rcu_read_lock(); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(package_details->name, hash_cur->key)) + if (!strcasecmp(package_details->name.name, hash_cur->key.name)) count += scnprintf(page + count, PAGE_SIZE - count, "%d ", atomic_read(&hash_cur->value)); } @@ -524,7 +549,7 @@ static ssize_t package_details_excluded_userids_store(struct config_item *item, if (ret) return ret; - ret = insert_userid_exclude_entry(to_package_details(item)->name, tmp); + ret = insert_userid_exclude_entry(&to_package_details(item)->name, tmp); if (ret) return ret; @@ -541,16 +566,16 @@ static ssize_t package_details_clear_userid_store(struct config_item *item, ret = kstrtouint(page, 10, &tmp); if (ret) return ret; - remove_userid_exclude_entry(to_package_details(item)->name, tmp); + remove_userid_exclude_entry(&to_package_details(item)->name, tmp); return count; } static void package_details_release(struct config_item *item) { struct package_details *package_details = to_package_details(item); - printk(KERN_INFO "sdcardfs: removing %s\n", package_details->name); - remove_packagelist_entry(package_details->name); - kfree(package_details->name); + printk(KERN_INFO "sdcardfs: removing %s\n", package_details->name.name); + remove_packagelist_entry(&package_details->name); + 
kfree(package_details->name.name); kfree(package_details); } @@ -582,7 +607,7 @@ struct extensions_value { struct extension_details { struct config_item item; - const char* name; + struct qstr name; unsigned int num; }; @@ -601,9 +626,9 @@ static void extension_details_release(struct config_item *item) struct extension_details *extension_details = to_extension_details(item); printk(KERN_INFO "sdcardfs: No longer mapping %s files to gid %d\n", - extension_details->name, extension_details->num); - remove_ext_gid_entry(extension_details->name, extension_details->num); - kfree(extension_details->name); + extension_details->name.name, extension_details->num); + remove_ext_gid_entry(&extension_details->name, extension_details->num); + kfree(extension_details->name.name); kfree(extension_details); } @@ -620,20 +645,21 @@ static struct config_item *extension_details_make_item(struct config_group *grou { struct extensions_value *extensions_value = to_extensions_value(&group->cg_item); struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL); + const char *tmp; int ret; if (!extension_details) return ERR_PTR(-ENOMEM); - extension_details->name = kstrdup(name, GFP_KERNEL); - if (!extension_details->name) { + tmp = kstrdup(name, GFP_KERNEL); + if (!tmp) { kfree(extension_details); return ERR_PTR(-ENOMEM); } - extension_details->num = extensions_value->num; - ret = insert_ext_gid_entry(name, extensions_value->num); + qstr_init(&extension_details->name, tmp); + ret = insert_ext_gid_entry(&extension_details->name, extensions_value->num); if (ret) { - kfree(extension_details->name); + kfree(extension_details->name.name); kfree(extension_details); return ERR_PTR(ret); } @@ -699,16 +725,17 @@ struct config_group extension_group = { static struct config_item *packages_make_item(struct config_group *group, const char *name) { struct package_details *package_details; + const char *tmp; package_details = kzalloc(sizeof(struct package_details), GFP_KERNEL); if (!package_details) return ERR_PTR(-ENOMEM); - package_details->name = kstrdup(name, GFP_KERNEL); - if (!package_details->name) { + tmp = kstrdup(name, GFP_KERNEL); + if (!tmp) { kfree(package_details); return ERR_PTR(-ENOMEM); } - + qstr_init(&package_details->name, tmp); config_item_init_type_name(&package_details->item, name, &package_appid_type); @@ -727,13 +754,13 @@ static ssize_t packages_list_show(struct config_item *item, char *page) rcu_read_lock(); hash_for_each_rcu(package_to_appid, i, hash_cur_app, hlist) { written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n", - hash_cur_app->key, atomic_read(&hash_cur_app->value)); - hash = str_hash(hash_cur_app->key); + hash_cur_app->key.name, atomic_read(&hash_cur_app->value)); + hash = hash_cur_app->key.hash; hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) { - if (!strcasecmp(hash_cur_app->key, hash_cur_user->key)) { + if (!strcasecmp(hash_cur_app->key.name, hash_cur_user->key.name)) { written += scnprintf(page + count + written - 1, PAGE_SIZE - sizeof(errormsg) - count - written + 1, - " %d\n", atomic_read(&hash_cur_user->value)) - 1; + " %d\n", atomic_read(&hash_cur_user->value)) - 1; } } if (count + written == PAGE_SIZE - sizeof(errormsg) - 1) { From 9ce149a4581a1516f8bead77075821e2cf05ee21 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 31 Jan 2017 20:07:51 -0800 Subject: [PATCH 0641/3220] ANDROID: sdcardfs: Switch strcasecmp for internal call This moves our uses of strcasecmp over to an internal 
call so we can easily change implementations later if we so desire. Additionally, we leverage qstr's where appropriate to save time on comparisons. Change-Id: I32fdc4fd0cd3b7b735dcfd82f60a2516fd8272a5 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/derived_perm.c | 35 ++++++++++++++++++++------------- fs/sdcardfs/file.c | 2 +- fs/sdcardfs/inode.c | 24 ++++++++++++----------- fs/sdcardfs/lookup.c | 18 +++++++---------- fs/sdcardfs/packagelist.c | 40 ++++++++++++++++++++------------------ fs/sdcardfs/sdcardfs.h | 17 ++++++++++++++-- 6 files changed, 78 insertions(+), 58 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index d2bff5ecdad0..0bb442338a85 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -51,11 +51,16 @@ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, } /* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ -void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const char *name) +void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name) { struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); appid_t appid; + struct qstr q_Android = QSTR_LITERAL("Android"); + struct qstr q_data = QSTR_LITERAL("data"); + struct qstr q_obb = QSTR_LITERAL("obb"); + struct qstr q_media = QSTR_LITERAL("media"); + struct qstr q_cache = QSTR_LITERAL("cache"); /* By default, each inode inherits from its parent. * the properties are maintained on its private fields @@ -79,12 +84,12 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; - info->userid = simple_strtoul(name, NULL, 10); + info->userid = simple_strtoul(name->name, NULL, 10); set_top(info, &info->vfs_inode); break; case PERM_ROOT: /* Assume masked off by default. 
*/ - if (!strcasecmp(name, "Android")) { + if (qstr_case_eq(name, &q_Android)) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID; info->under_android = true; @@ -92,17 +97,17 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co } break; case PERM_ANDROID: - if (!strcasecmp(name, "data")) { + if (qstr_case_eq(name, &q_data)) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_DATA; set_top(info, &info->vfs_inode); - } else if (!strcasecmp(name, "obb")) { + } else if (qstr_case_eq(name, &q_obb)) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_OBB; info->under_obb = true; set_top(info, &info->vfs_inode); /* Single OBB directory is always shared */ - } else if (!strcasecmp(name, "media")) { + } else if (qstr_case_eq(name, &q_media)) { /* App-specific directories inside; let anyone traverse */ info->perm = PERM_ANDROID_MEDIA; set_top(info, &info->vfs_inode); @@ -112,14 +117,14 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co case PERM_ANDROID_DATA: case PERM_ANDROID_MEDIA: info->perm = PERM_ANDROID_PACKAGE; - appid = get_appid(name); - if (appid != 0 && !is_excluded(name, parent_info->userid)) { + appid = get_appid(name->name); + if (appid != 0 && !is_excluded(name->name, parent_info->userid)) { info->d_uid = multiuser_get_uid(parent_info->userid, appid); } set_top(info, &info->vfs_inode); break; case PERM_ANDROID_PACKAGE: - if (!strcasecmp(name, "cache")) { + if (qstr_case_eq(name, &q_cache)) { info->perm = PERM_ANDROID_PACKAGE_CACHE; info->under_cache = true; } @@ -129,7 +134,7 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co void get_derived_permission(struct dentry *parent, struct dentry *dentry) { - get_derived_permission_new(parent, dentry, dentry->d_name.name); + get_derived_permission_new(parent, dentry, &dentry->d_name); } static appid_t get_type(const char *name) { @@ -360,9 +365,10 @@ int need_graft_path(struct dentry *dentry) struct dentry *parent = dget_parent(dentry); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + struct qstr obb = QSTR_LITERAL("obb"); if(parent_info->perm == PERM_ANDROID && - !strcasecmp(dentry->d_name.name, "obb")) { + qstr_case_eq(&dentry->d_name, &obb)) { /* /Android/obb is the base obbpath of DERIVED_UNIFIED */ if(!(sbi->options.multiuser == false @@ -399,7 +405,7 @@ int is_obbpath_invalid(struct dentry *dent) } else { obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX); if (d_unhashed(di->lower_path.dentry) || - strcasecmp(sbi->obbpath_s, obbpath_s)) { + !str_case_eq(sbi->obbpath_s, obbpath_s)) { ret = 1; } kfree(path_buf); @@ -419,15 +425,16 @@ int is_base_obbpath(struct dentry *dentry) struct dentry *parent = dget_parent(dentry); struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); + struct qstr q_obb = QSTR_LITERAL("obb"); spin_lock(&SDCARDFS_D(dentry)->lock); if (sbi->options.multiuser) { if(parent_info->perm == PERM_PRE_ROOT && - !strcasecmp(dentry->d_name.name, "obb")) { + qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } } else if (parent_info->perm == PERM_ANDROID && - !strcasecmp(dentry->d_name.name, "obb")) { + qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } spin_unlock(&SDCARDFS_D(dentry)->lock); diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 
006c6ff57ad7..23f8cd7f8877 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -216,7 +216,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) goto out_err; } - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index cb0588691a0f..68e615045616 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -66,7 +66,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct fs_struct *saved_fs; struct fs_struct *copied_fs; - if(!check_caller_access_to_name(dir, dentry->d_name.name)) { + if(!check_caller_access_to_name(dir, &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -168,7 +168,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) struct path lower_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(dir, dentry->d_name.name)) { + if(!check_caller_access_to_name(dir, &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -275,8 +275,10 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode int touch_err = 0; struct fs_struct *saved_fs; struct fs_struct *copied_fs; + struct qstr q_obb = QSTR_LITERAL("obb"); + struct qstr q_data = QSTR_LITERAL("data"); - if(!check_caller_access_to_name(dir, dentry->d_name.name)) { + if(!check_caller_access_to_name(dir, &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -351,13 +353,13 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink); fixup_lower_ownership(dentry, dentry->d_name.name); unlock_dir(lower_parent_dentry); - if ((!sbi->options.multiuser) && (!strcasecmp(dentry->d_name.name, "obb")) + if ((!sbi->options.multiuser) && (qstr_case_eq(&dentry->d_name, &q_obb)) && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) make_nomedia_in_obb = 1; /* When creating /Android/data and /Android/obb, mark them as .nomedia */ if (make_nomedia_in_obb || - ((pi->perm == PERM_ANDROID) && (!strcasecmp(dentry->d_name.name, "data")))) { + ((pi->perm == PERM_ANDROID) && (qstr_case_eq(&dentry->d_name, &q_data)))) { REVERT_CRED(saved_cred); OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(d_inode(dentry))); set_fs_pwd(current->fs, &lower_path); @@ -388,7 +390,7 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) struct path lower_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(dir, dentry->d_name.name)) { + if(!check_caller_access_to_name(dir, &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -476,8 +478,8 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct path lower_old_path, lower_new_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(old_dir, old_dentry->d_name.name) || - 
!check_caller_access_to_name(new_dir, new_dentry->d_name.name)) { + if(!check_caller_access_to_name(old_dir, &old_dentry->d_name) || + !check_caller_access_to_name(new_dir, &new_dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " new_dentry: %s, task:%s\n", __func__, new_dentry->d_name.name, current->comm); @@ -523,7 +525,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry)); fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry)); } - get_derived_permission_new(new_dentry->d_parent, old_dentry, new_dentry->d_name.name); + get_derived_permission_new(new_dentry->d_parent, old_dentry, &new_dentry->d_name); fixup_tmp_permissions(d_inode(old_dentry)); fixup_lower_ownership(old_dentry, new_dentry->d_name.name); drop_recursive(old_dentry); /* Can't fixup ownership recursively :( */ @@ -743,7 +745,7 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct if (!err) { /* check the Android group ID */ parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); @@ -861,7 +863,7 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, int err; parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", __func__, dentry->d_name.name, current->comm); diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 3c9454e5e1c6..9135866b7766 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -219,9 +219,8 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, struct vfsmount *lower_dir_mnt; struct dentry *lower_dir_dentry = NULL; struct dentry *lower_dentry; - const char *name; + const struct qstr *name; struct path lower_path; - struct qstr this; struct sdcardfs_sb_info *sbi; sbi = SDCARDFS_SB(dentry->d_sb); @@ -231,14 +230,14 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, if (IS_ROOT(dentry)) goto out; - name = dentry->d_name.name; + name = &dentry->d_name; /* now start the actual lookup procedure */ lower_dir_dentry = lower_parent_path->dentry; lower_dir_mnt = lower_parent_path->mnt; /* Use vfs_path_lookup to check if the dentry exists or not */ - err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, + err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name->name, 0, &lower_path); /* check for other cases */ if (err == -ENOENT) { @@ -248,7 +247,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, spin_lock(&lower_dir_dentry->d_lock); list_for_each_entry(child, &lower_dir_dentry->d_subdirs, d_child) { if (child && d_inode(child)) { - if (strcasecmp(child->d_name.name, name)==0) { + if (qstr_case_eq(&child->d_name, name)) { match = dget(child); break; } @@ -307,14 +306,11 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, goto out; /* instatiate a new negative dentry */ - this.name = name; - this.len = strlen(name); - this.hash = full_name_hash(this.name, this.len); - lower_dentry = d_lookup(lower_dir_dentry, &this); + lower_dentry = d_lookup(lower_dir_dentry, 
name); if (lower_dentry) goto setup_lower; - lower_dentry = d_alloc(lower_dir_dentry, &this); + lower_dentry = d_alloc(lower_dir_dentry, name); if (!lower_dentry) { err = -ENOMEM; goto out; @@ -359,7 +355,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), dentry->d_name.name)) { + if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { ret = ERR_PTR(-EACCES); printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" " dentry: %s, task:%s\n", diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index b02feef08d51..d96fcde041cc 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -65,7 +65,7 @@ static appid_t __get_appid(const struct qstr *key) rcu_read_lock(); hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { ret_id = atomic_read(&hash_cur->value); rcu_read_unlock(); return ret_id; @@ -90,7 +90,7 @@ static appid_t __get_ext_gid(const struct qstr *key) rcu_read_lock(); hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { ret_id = atomic_read(&hash_cur->value); rcu_read_unlock(); return ret_id; @@ -115,7 +115,7 @@ static appid_t __is_excluded(const struct qstr *app_name, userid_t user) rcu_read_lock(); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { if (atomic_read(&hash_cur->value) == user && - !strcasecmp(app_name->name, hash_cur->key.name)) { + qstr_case_eq(app_name, &hash_cur->key)) { rcu_read_unlock(); return 1; } @@ -124,24 +124,26 @@ static appid_t __is_excluded(const struct qstr *app_name, userid_t user) return 0; } -appid_t is_excluded(const char *app_name, userid_t user) +appid_t is_excluded(const char *key, userid_t user) { struct qstr q; - qstr_init(&q, app_name); + qstr_init(&q, key); return __is_excluded(&q, user); } - /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access * even further, such as enforcing that apps hold sdcard_rw. 
*/ -int check_caller_access_to_name(struct inode *parent_node, const char* name) { +int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name) { + struct qstr q_autorun = QSTR_LITERAL("autorun.inf"); + struct qstr q__android_secure = QSTR_LITERAL(".android_secure"); + struct qstr q_android_secure = QSTR_LITERAL("android_secure"); /* Always block security-sensitive files at root */ if (parent_node && SDCARDFS_I(parent_node)->perm == PERM_ROOT) { - if (!strcasecmp(name, "autorun.inf") - || !strcasecmp(name, ".android_secure") - || !strcasecmp(name, "android_secure")) { + if (qstr_case_eq(name, &q_autorun) + || qstr_case_eq(name, &q__android_secure) + || qstr_case_eq(name, &q_android_secure)) { return 0; } } @@ -193,7 +195,7 @@ static int insert_packagelist_appid_entry_locked(const struct qstr *key, appid_t unsigned int hash = key->hash; hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { atomic_set(&hash_cur->value, value); return 0; } @@ -213,7 +215,7 @@ static int insert_ext_gid_entry_locked(const struct qstr *key, appid_t value) /* An extension can only belong to one gid */ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) + if (qstr_case_eq(key, &hash_cur->key)) return -EINVAL; } new_entry = alloc_hashtable_entry(key, value); @@ -232,7 +234,7 @@ static int insert_userid_exclude_entry_locked(const struct qstr *key, userid_t v /* Only insert if not already present */ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { if (atomic_read(&hash_cur->value) == value && - !strcasecmp(key->name, hash_cur->key.name)) + qstr_case_eq(key, &hash_cur->key)) return 0; } new_entry = alloc_hashtable_entry(key, value); @@ -336,13 +338,13 @@ static void remove_packagelist_entry_locked(const struct qstr *key) HLIST_HEAD(free_list); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { hash_del_rcu(&hash_cur->hlist); hlist_add_head(&hash_cur->dlist, &free_list); } } hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name)) { + if (qstr_case_eq(key, &hash_cur->key)) { hash_del_rcu(&hash_cur->hlist); hlist_add_head(&hash_cur->dlist, &free_list); break; @@ -368,7 +370,7 @@ static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group) unsigned int hash = key->hash; hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name) && atomic_read(&hash_cur->value) == group) { + if (qstr_case_eq(key, &hash_cur->key) && atomic_read(&hash_cur->value) == group) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); free_hashtable_entry(hash_cur); @@ -419,7 +421,7 @@ static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t unsigned int hash = key->hash; hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(key->name, hash_cur->key.name) && + if (qstr_case_eq(key, &hash_cur->key) && atomic_read(&hash_cur->value) == userid) { hash_del_rcu(&hash_cur->hlist); synchronize_rcu(); @@ -528,7 +530,7 @@ static ssize_t package_details_excluded_userids_show(struct config_item *item, rcu_read_lock(); hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) { - if (!strcasecmp(package_details->name.name, 
hash_cur->key.name)) + if (qstr_case_eq(&package_details->name, &hash_cur->key)) count += scnprintf(page + count, PAGE_SIZE - count, "%d ", atomic_read(&hash_cur->value)); } @@ -757,7 +759,7 @@ static ssize_t packages_list_show(struct config_item *item, char *page) hash_cur_app->key.name, atomic_read(&hash_cur_app->value)); hash = hash_cur_app->key.hash; hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) { - if (!strcasecmp(hash_cur_app->key.name, hash_cur_user->key.name)) { + if (qstr_case_eq(&hash_cur_app->key, &hash_cur_user->key)) { written += scnprintf(page + count + written - 1, PAGE_SIZE - sizeof(errormsg) - count - written + 1, " %d\n", atomic_read(&hash_cur_user->value)) - 1; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 03da961e3b09..f3cced313108 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -459,7 +459,7 @@ extern struct list_head sdcardfs_super_list; extern appid_t get_appid(const char *app_name); extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); -extern int check_caller_access_to_name(struct inode *parent_node, const char* name); +extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr* name); extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void); @@ -477,7 +477,7 @@ struct limit_search { extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); -extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const char *name); +extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name); extern void drop_recursive(struct dentry *parent); extern void fixup_top_recursive(struct dentry *parent); extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); @@ -605,4 +605,17 @@ static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct dest->i_flags = src->i_flags; set_nlink(dest, src->i_nlink); } + +static inline bool str_case_eq(const char *s1, const char *s2) +{ + return !strcasecmp(s1, s2); +} + +static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2) +{ + return q1->len == q2->len && str_case_eq(q1->name, q2->name); +} + +#define QSTR_LITERAL(string) QSTR_INIT(string, sizeof(string)-1) + #endif /* not _SDCARDFS_H_ */ From 91495bc622e9356dd63a7c77b98a21f0e2f5d2b2 Mon Sep 17 00:00:00 2001 From: Pratyush Anand Date: Mon, 14 Nov 2016 19:32:42 +0530 Subject: [PATCH 0642/3220] BACKPORT: hw_breakpoint: Allow watchpoint of length 3,5,6 and 7 (cherry picked from commit 651be3cb085341a21847e47c694c249c3e1e4e5b) We only support breakpoints/watchpoints of length 1, 2, 4 and 8. If we can support other lengths as well, then users may watch more data with fewer watchpoints (provided the hardware supports it). For example: if we have to watch only the 4th, 5th and 6th bytes from a 64 bit aligned address, we currently have to use two slots to implement it. One slot will watch a half word at offset 4 and the other a byte at offset 6. If we can have a watchpoint of length 3, then we can watch it with a single slot as well. ARM64 hardware does support such functionality, so add these new definitions in the generic layer.
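For illustration, with these definitions a single slot can watch bytes 4-6 of a 64-bit aligned variable from user space. A minimal sketch (not part of this patch; error handling omitted, and it assumes a kernel with this change plus hardware support for the new lengths):

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long long watched;	/* 64-bit aligned target */

int main(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = (unsigned long)&watched + 4;	/* 4th byte */
	attr.bp_len = HW_BREAKPOINT_LEN_3;		/* bytes 4, 5 and 6 */

	/* pid 0 = this task, cpu -1 = any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0) < 0;
}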
Signed-off-by: Pratyush Anand Signed-off-by: Will Deacon Signed-off-by: Pavel Labath [pavel: tools/include/uapi/linux/hw_breakpoint.h is not present in this branch] Change-Id: Ie17ed89ca526e4fddf591bb4e556fdfb55fc2eac Bug: 30919905 --- include/uapi/linux/hw_breakpoint.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/hw_breakpoint.h b/include/uapi/linux/hw_breakpoint.h index b04000a2296a..2b65efd19a46 100644 --- a/include/uapi/linux/hw_breakpoint.h +++ b/include/uapi/linux/hw_breakpoint.h @@ -4,7 +4,11 @@ enum { HW_BREAKPOINT_LEN_1 = 1, HW_BREAKPOINT_LEN_2 = 2, + HW_BREAKPOINT_LEN_3 = 3, HW_BREAKPOINT_LEN_4 = 4, + HW_BREAKPOINT_LEN_5 = 5, + HW_BREAKPOINT_LEN_6 = 6, + HW_BREAKPOINT_LEN_7 = 7, HW_BREAKPOINT_LEN_8 = 8, }; From 7409857a0717fa78dc936ea08099880be893156c Mon Sep 17 00:00:00 2001 From: Pratyush Anand Date: Mon, 14 Nov 2016 19:32:43 +0530 Subject: [PATCH 0643/3220] UPSTREAM: arm64: Allow hw watchpoint at varied offset from base address ARM64 hardware supports a watchpoint at any double word aligned address. However, it can select any consecutive bytes from offset 0 to 7 from that base address. For example, if the base address is programmed as 0x420030 and the byte select is 0x1C, then accesses of 0x420032, 0x420033 and 0x420034 will generate a watchpoint exception. Currently, we do not have such modularity. We can only program byte, halfword, word and double word access exceptions from any base address. This patch adds support to overcome the above limitations. Signed-off-by: Pratyush Anand Signed-off-by: Will Deacon Signed-off-by: Pavel Labath Change-Id: I28b1ca63f63182c10c3d6b6b3bacf6c56887ddbe Bug: 30919905 --- arch/arm64/include/asm/hw_breakpoint.h | 2 +- arch/arm64/kernel/hw_breakpoint.c | 47 +++++++++++++------------- arch/arm64/kernel/ptrace.c | 7 ++-- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 9732908bfc8a..8acfd989a4e4 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -110,7 +110,7 @@ struct perf_event; struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type); + int *gen_len, int *gen_type, int *offset); extern int arch_check_bp_in_kernelspace(struct perf_event *bp); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index b45c95d34b83..21cb18ba9ff8 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -345,7 +345,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) * to generic breakpoint descriptions.
*/ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type) + int *gen_len, int *gen_type, int *offset) { /* Type */ switch (ctrl.type) { @@ -365,8 +365,12 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } + if (!ctrl.len) + return -EINVAL; + *offset = __ffs(ctrl.len); + /* Len */ - switch (ctrl.len) { + switch (ctrl.len >> *offset) { case ARM_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -513,18 +517,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) default: return -EINVAL; } - - info->address &= ~alignment_mask; - info->ctrl.len <<= offset; } else { if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) alignment_mask = 0x3; else alignment_mask = 0x7; - if (info->address & alignment_mask) - return -EINVAL; + offset = info->address & alignment_mask; } + info->address &= ~alignment_mask; + info->ctrl.len <<= offset; + /* * Disallow per-task kernel breakpoints since these would * complicate the stepping code. @@ -659,8 +662,8 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access; - u32 ctrl_reg; - u64 val, alignment_mask; + u32 ctrl_reg, lens, lene; + u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; struct arch_hw_breakpoint *info; @@ -678,25 +681,21 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, goto unlock; info = counter_arch_bp(wp); - /* AArch32 watchpoints are either 4 or 8 bytes aligned. */ - if (is_compat_task()) { - if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) - alignment_mask = 0x7; - else - alignment_mask = 0x3; - } else { - alignment_mask = 0x7; - } - /* Check if the watchpoint value matches. */ + /* Check if the watchpoint value and byte select match. */ val = read_wb_reg(AARCH64_DBG_REG_WVR, i); - if (val != (addr & ~alignment_mask)) - goto unlock; - - /* Possible match, check the byte address select to confirm. */ ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); decode_ctrl_reg(ctrl_reg, &ctrl); - if (!((1 << (addr & alignment_mask)) & ctrl.len)) + lens = ffs(ctrl.len) - 1; + lene = fls(ctrl.len) - 1; + /* + * FIXME: reported address can be anywhere between "the + * lowest address accessed by the memory access that + * triggered the watchpoint" and "the highest watchpointed + * address accessed by the memory access". So, it may not + * lie in the interval of watchpoint address range. + */ + if (addr < val + lens || addr > val + lene) goto unlock; /* diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 1971f491bb90..4db41b860c18 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -220,13 +220,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, disabled = !ctrl.enabled; + int err, len, type, offset, disabled = !ctrl.enabled; attr->disabled = disabled; if (disabled) return 0; - err = arch_bp_generic_fields(ctrl, &len, &type); + err = arch_bp_generic_fields(ctrl, &len, &type, &offset); if (err) return err; @@ -245,6 +245,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, attr->bp_len = len; attr->bp_type = type; + attr->bp_addr += offset; return 0; } @@ -297,7 +298,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type, if (IS_ERR(bp)) return PTR_ERR(bp); - *addr = bp ? bp->attr.bp_addr : 0; + *addr = bp ? 
counter_arch_bp(bp)->address : 0; return 0; } From b48318f371e8a8b98238deac868bc7af8ed8ba4b Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 14 Nov 2016 19:32:44 +0530 Subject: [PATCH 0644/3220] BACKPORT: arm64: hw_breakpoint: Handle inexact watchpoint addresses (cherry picked from commit fdfeff0f9e3d9be2b68fa02566017ffc581ae17b) Arm64 hardware does not always report a watchpoint hit address that matches one of the watchpoints set. It can also report an address "near" the watchpoint if a single instruction accesses both watched and unwatched addresses. There is no straightforward way, short of disassembling the offending instruction, to map that address back to the watchpoint. Previously, when the hardware reported a watchpoint hit on an address that did not match our watchpoint (this happens in the case of instructions which access large chunks of memory, such as "stp"), the process would enter a loop where we would be continually resuming it (because we did not recognise that watchpoint hit) and it would keep hitting the watchpoint again and again. The tracing process would never get notified of the watchpoint hit. This commit fixes the problem by looking at the watchpoints near the address reported by the hardware. If the address does not exactly match one of the watchpoints we have set, it attributes the hit to the nearest watchpoint we have. This heuristic is a bit dodgy, but I don't think we can do much more, given the hardware limitations. Signed-off-by: Pavel Labath [panand: reworked to rebase on his patches] Signed-off-by: Pratyush Anand [will: use __ffs instead of ffs - 1] Signed-off-by: Will Deacon Signed-off-by: Pavel Labath [pavel: trivial fixup in hw_breakpoint.c:watchpoint_handler] Change-Id: I714dfaa3947d89d89a9e9a1ea84914d44ba0faa3 Bug: 30919905 --- arch/arm64/kernel/hw_breakpoint.c | 98 ++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 21cb18ba9ff8..6e02c8e4a8d0 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -658,11 +658,46 @@ unlock: return 0; } +/* + * Arm64 hardware does not always report a watchpoint hit address that matches + * one of the watchpoints set. It can also report an address "near" the + * watchpoint if a single instruction access both watched and unwatched + * addresses. There is no straight-forward way, short of disassembling the + * offending instruction, to map that address back to the watchpoint. This + * function computes the distance of the memory access from the watchpoint as a + * heuristic for the likelyhood that a given access triggered the watchpoint. + * + * See Section D2.10.5 "Determining the memory location that caused a Watchpoint + * exception" of ARMv8 Architecture Reference Manual for details. + * + * The function returns the distance of the address from the bytes watched by + * the watchpoint. In case of an exact match, it returns 0.
+ */ +static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, + struct arch_hw_breakpoint_ctrl *ctrl) +{ + u64 wp_low, wp_high; + u32 lens, lene; + + lens = __ffs(ctrl->len); + lene = __fls(ctrl->len); + + wp_low = val + lens; + wp_high = val + lene; + if (addr < wp_low) + return wp_low - addr; + else if (addr > wp_high) + return addr - wp_high; + else + return 0; +} + static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - int i, step = 0, *kernel_step, access; - u32 ctrl_reg, lens, lene; + int i, step = 0, *kernel_step, access, closest_match = 0; + u64 min_dist = -1, dist; + u32 ctrl_reg; u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; @@ -672,31 +707,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, slots = this_cpu_ptr(wp_on_reg); debug_info = &current->thread.debug; + /* + * Find all watchpoints that match the reported address. If no exact + * match is found. Attribute the hit to the closest watchpoint. + */ + rcu_read_lock(); for (i = 0; i < core_num_wrps; ++i) { - rcu_read_lock(); - wp = slots[i]; - if (wp == NULL) - goto unlock; + continue; /* * Check that the access type matches. @@ -705,18 +724,41 @@ access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) - goto unlock; + continue; + /* Check if the watchpoint value and byte select match. */ + val = read_wb_reg(AARCH64_DBG_REG_WVR, i); + ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); + decode_ctrl_reg(ctrl_reg, &ctrl); + dist = get_distance_from_watchpoint(addr, val, &ctrl); + if (dist < min_dist) { + min_dist = dist; + closest_match = i; + } + /* Is this an exact match? */ + if (dist != 0) + continue; + + info = counter_arch_bp(wp); info->trigger = addr; perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ if (!wp->overflow_handler) step = 1; - -unlock: - rcu_read_unlock(); } + if (min_dist > 0 && min_dist != -1) { + /* No exact match found. */ + wp = slots[closest_match]; + info = counter_arch_bp(wp); + info->trigger = addr; + perf_bp_event(wp, regs); + + /* Do we need to handle the stepping? */ + if (!wp->overflow_handler) + step = 1; + } + rcu_read_unlock(); if (!step) return 0; From edc166a8714b012a3dd207e437c772ae2a264eca Mon Sep 17 00:00:00 2001 From: Pratyush Anand Date: Mon, 14 Nov 2016 19:32:45 +0530 Subject: [PATCH 0645/3220] UPSTREAM: arm64: Allow hw watchpoint of length 3,5,6 and 7 (cherry picked from commit 0ddb8e0b784ba034f3096d5a54684d0d73155e2a) Since arm64 can support any offset within a double word limit, now support other lengths within that range as well.
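The underlying encoding is the byte address select field: the length mask shifted by the watchpoint's offset within the double word, which is what arch_validate_hwbkpt_settings() computes above with info->ctrl.len <<= offset. A standalone sketch of that computation (illustration only, not part of this patch):

/*
 * BAS mask for a watchpoint of 'len' bytes (1-8) starting at 'addr' and
 * staying within one double word. ARM_BREAKPOINT_LEN_3 is 0x7, so bytes
 * 4-6 give 0x7 << 4 == 0x70; the 0x1C byte select from the earlier
 * example is len 3 at offset 2.
 */
static unsigned int bas_mask(unsigned long addr, unsigned int len)
{
	unsigned int offset = addr & 0x7;	/* offset in the double word */

	return ((1U << len) - 1) << offset;
}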
Signed-off-by: Pratyush Anand Signed-off-by: Will Deacon Signed-off-by: Pavel Labath Change-Id: Ibcb263a3903572336ccbf96e0180d3990326545a Bug: 30919905 --- arch/arm64/include/asm/hw_breakpoint.h | 4 +++ arch/arm64/kernel/hw_breakpoint.c | 36 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 8acfd989a4e4..c72b8e201ab4 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -68,7 +68,11 @@ static inline void decode_ctrl_reg(u32 reg, /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 #define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 #define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f #define ARM_BREAKPOINT_LEN_8 0xff /* Kernel stepping */ diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 6e02c8e4a8d0..2d2792a714ad 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -313,9 +313,21 @@ static int get_hbp_len(u8 hbp_len) case ARM_BREAKPOINT_LEN_2: len_in_bytes = 2; break; + case ARM_BREAKPOINT_LEN_3: + len_in_bytes = 3; + break; case ARM_BREAKPOINT_LEN_4: len_in_bytes = 4; break; + case ARM_BREAKPOINT_LEN_5: + len_in_bytes = 5; + break; + case ARM_BREAKPOINT_LEN_6: + len_in_bytes = 6; + break; + case ARM_BREAKPOINT_LEN_7: + len_in_bytes = 7; + break; case ARM_BREAKPOINT_LEN_8: len_in_bytes = 8; break; @@ -377,9 +389,21 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, case ARM_BREAKPOINT_LEN_2: *gen_len = HW_BREAKPOINT_LEN_2; break; + case ARM_BREAKPOINT_LEN_3: + *gen_len = HW_BREAKPOINT_LEN_3; + break; case ARM_BREAKPOINT_LEN_4: *gen_len = HW_BREAKPOINT_LEN_4; break; + case ARM_BREAKPOINT_LEN_5: + *gen_len = HW_BREAKPOINT_LEN_5; + break; + case ARM_BREAKPOINT_LEN_6: + *gen_len = HW_BREAKPOINT_LEN_6; + break; + case ARM_BREAKPOINT_LEN_7: + *gen_len = HW_BREAKPOINT_LEN_7; + break; case ARM_BREAKPOINT_LEN_8: *gen_len = HW_BREAKPOINT_LEN_8; break; @@ -423,9 +447,21 @@ static int arch_build_bp_info(struct perf_event *bp) case HW_BREAKPOINT_LEN_2: info->ctrl.len = ARM_BREAKPOINT_LEN_2; break; + case HW_BREAKPOINT_LEN_3: + info->ctrl.len = ARM_BREAKPOINT_LEN_3; + break; case HW_BREAKPOINT_LEN_4: info->ctrl.len = ARM_BREAKPOINT_LEN_4; break; + case HW_BREAKPOINT_LEN_5: + info->ctrl.len = ARM_BREAKPOINT_LEN_5; + break; + case HW_BREAKPOINT_LEN_6: + info->ctrl.len = ARM_BREAKPOINT_LEN_6; + break; + case HW_BREAKPOINT_LEN_7: + info->ctrl.len = ARM_BREAKPOINT_LEN_7; + break; case HW_BREAKPOINT_LEN_8: info->ctrl.len = ARM_BREAKPOINT_LEN_8; break; From 0213f79a418915f2db36572ae86b748438ed635b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2016 00:34:58 -0400 Subject: [PATCH 0646/3220] UPSTREAM: udp: properly support MSG_PEEK with truncated buffers [ Upstream commit 197c949e7798fbf28cfadc69d9ca0c2abbf93191 ] Backporting upstream commit 89c22d8c3b27 ("net: Fix skb csum races when peeking") into stable kernels exposed a bug in the udp stack's MSG_PEEK support when the user provides a buffer smaller than the skb payload. In this case, skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); returns -EFAULT. This bug does not happen in upstream kernels, since Al Viro did a great job replacing this with: skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg); This variant is safe vs short buffers.
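In user space the failing case boils down to a peek with a buffer smaller than the queued datagram. A minimal sketch (illustration only, not part of this patch; whether the bad path is exercised depends on the skb checksum state, e.g. loopback traffic may be marked CHECKSUM_UNNECESSARY and never hit it):

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in sa = { .sin_family = AF_INET };
	socklen_t salen = sizeof(sa);
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	char small[4];
	ssize_t n;

	sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	bind(fd, (struct sockaddr *)&sa, sizeof(sa));	/* ephemeral port */
	getsockname(fd, (struct sockaddr *)&sa, &salen);
	sendto(fd, "0123456789", 10, 0, (struct sockaddr *)&sa, salen);

	n = recv(fd, small, sizeof(small), MSG_PEEK);	/* buffer < payload */
	printf("peek returned %zd\n", n);	/* expect 4, not -1/EFAULT */
	close(fd);
	return 0;
}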
For the time being, instead of reverting Herbert Xu's patch and adding back the invalid skb->ip_summed changes, simply store the result of udp_lib_checksum_complete() so that we avoid computing the checksum a second time, and avoid the problematic skb_copy_and_csum_datagram_iovec() call. This patch can be applied to recent kernels, as it avoids double checksumming, and then backported to stable kernels as a bug fix. Signed-off-by: Eric Dumazet Acked-by: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman (cherry picked from commit dfe2042d96065f044a794f684e9f7976a4ca6e24) Bug: 32813456 --- net/ipv4/udp.c | 6 ++++-- net/ipv6/udp.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8acf544794a1..8271c01be33e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1274,6 +1274,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); + bool checksum_valid = false; bool slow; if (flags & MSG_ERRQUEUE) @@ -1299,11 +1300,12 @@ try_again: */ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { - if (udp_lib_checksum_complete(skb)) + checksum_valid = !udp_lib_checksum_complete(skb); + if (!checksum_valid) goto csum_copy_err; } - if (skb_csum_unnecessary(skb)) + if (checksum_valid || skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0890fd6d4248..d46bb67c3001 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -402,6 +402,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); + bool checksum_valid = false; int is_udp4; bool slow; @@ -433,11 +434,12 @@ try_again: */ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { - if (udp_lib_checksum_complete(skb)) + checksum_valid = !udp_lib_checksum_complete(skb); + if (!checksum_valid) goto csum_copy_err; } - if (skb_csum_unnecessary(skb)) + if (checksum_valid || skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { From 49b60d4aa95aa0519238a06fde5c838146742796 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Sep 2016 17:39:09 +0200 Subject: [PATCH 0647/3220] BACKPORT: posix_acl: Clear SGID bit when setting file permissions (cherry pick from commit 073931017b49d9458aa351605b43a7e34598caef) When file permissions are modified via chmod(2) and the user is not in the owning group or capable of CAP_FSETID, the setgid bit is cleared in inode_change_ok(). Setting a POSIX ACL via setxattr(2) sets the file permissions as well as the new ACL, but doesn't clear the setgid bit in a similar way; this allows bypassing the check in chmod(2). Fix that. NB: We did not resolve the ACL leak in this CL; an additional upstream fix is required.
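Every filesystem converted below follows the same calling convention; as a standalone sketch of the ACL_TYPE_ACCESS branch (the examplefs name is hypothetical, illustration only):

/*
 * posix_acl_update_mode() computes the new mode bits, clears SGID when
 * the caller is not in the owning group and lacks CAP_FSETID, and sets
 * *acl to NULL when the ACL is fully representable in the mode bits.
 */
static int examplefs_set_access_acl(struct inode *inode, struct posix_acl **acl)
{
	int error;

	if (*acl) {
		error = posix_acl_update_mode(inode, &inode->i_mode, acl);
		if (error)
			return error;
		mark_inode_dirty(inode);	/* as ext2/ext4 do below */
	}
	return 0;
}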
References: CVE-2016-7097 Reviewed-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Jan Kara Signed-off-by: Andreas Gruenbacher Bug: 32458736 Change-Id: I19591ad452cc825ac282b3cfd2daaa72aa9a1ac1 --- fs/9p/acl.c | 40 +++++++++++++++++---------------------- fs/btrfs/acl.c | 6 ++---- fs/ceph/acl.c | 6 ++---- fs/ext2/acl.c | 12 ++++-------- fs/ext4/acl.c | 12 ++++-------- fs/f2fs/acl.c | 6 ++---- fs/gfs2/acl.c | 12 +++--------- fs/hfsplus/posix_acl.c | 4 ++-- fs/jffs2/acl.c | 9 ++++----- fs/jfs/acl.c | 6 ++---- fs/ocfs2/acl.c | 10 ++++------ fs/posix_acl.c | 31 ++++++++++++++++++++++++++++++ fs/reiserfs/xattr_acl.c | 8 ++------ fs/xfs/xfs_acl.c | 13 ++++--------- include/linux/posix_acl.h | 1 + 15 files changed, 84 insertions(+), 92 deletions(-) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index a7e28890f5ef..929b618da43b 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -282,32 +282,26 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - umode_t mode = inode->i_mode; - retval = posix_acl_equiv_mode(acl, &mode); - if (retval < 0) + struct iattr iattr; + + retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + if (retval) goto err_out; - else { - struct iattr iattr; - if (retval == 0) { - /* - * ACL can be represented - * by the mode bits. So don't - * update ACL. - */ - acl = NULL; - value = NULL; - size = 0; - } - /* Updte the mode bits */ - iattr.ia_mode = ((mode & S_IALLUGO) | - (inode->i_mode & ~S_IALLUGO)); - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ? - * What is the following setxattr update the - * mode ? + if (!acl) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. */ - v9fs_vfs_setattr_dotl(dentry, &iattr); + value = NULL; + size = 0; } + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? 
+ */ + v9fs_vfs_setattr_dotl(dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9a0124a95851..fb3e64d37cb4 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -83,11 +83,9 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &inode->i_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) return ret; - if (ret == 0) - acl = NULL; } ret = 0; break; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 8f84646f10e9..4d8caeb94a11 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -94,11 +94,9 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &new_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &new_mode, &acl); + if (ret) goto out; - if (ret == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 27695e6f4e46..d6aeb84e90b6 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -193,15 +193,11 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); } break; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 69b1e73026a5..c3fe1e323951 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -196,15 +196,11 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c8f25f7241f0..e9a8d676c6bc 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -214,12 +214,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(fi, inode->i_mode); - if (error == 0) - acl = NULL; } break; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1be3b061c05c..ff0ac96a8e7b 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -79,17 +79,11 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - - if (error == 0) - acl = NULL; - - if (mode != inode->i_mode) { - inode->i_mode = mode; + if (mode != inode->i_mode) mark_inode_dirty(inode); - } } if (acl) { diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index df0c9af68d05..71b3087b7e32 100644 --- 
a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -68,8 +68,8 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, case ACL_TYPE_ACCESS: xattr_name = POSIX_ACL_XATTR_ACCESS; if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - if (err < 0) + err = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (err) return err; } err = 0; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 2f7a3c090489..f9f86f87d32b 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -235,9 +235,10 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - rc = posix_acl_equiv_mode(acl, &mode); - if (rc < 0) + umode_t mode; + + rc = posix_acl_update_mode(inode, &mode, &acl); + if (rc) return rc; if (inode->i_mode != mode) { struct iattr attr; @@ -249,8 +250,6 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (rc < 0) return rc; } - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 0c8ca830b113..9fad9f4fe883 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -84,13 +84,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, case ACL_TYPE_ACCESS: ea_name = POSIX_ACL_XATTR_ACCESS; if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - if (rc < 0) + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) return rc; inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0cdf497c91ef..18f0c9afab66 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -241,13 +241,11 @@ int ocfs2_set_acl(handle_t *handle, case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; + umode_t mode; - if (ret == 0) - acl = NULL; + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + return ret; ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4adde1e2cbec..31f6a17327a8 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -592,6 +592,37 @@ no_mem: } EXPORT_SYMBOL_GPL(posix_acl_create); +/** + * posix_acl_update_mode - update mode in set_acl + * + * Update the file mode when setting an ACL: compute the new file permission + * bits based on the ACL. In addition, if the ACL is equivalent to the new + * file mode, set *acl to NULL to indicate that no ACL should be set. + * + * As with chmod, clear the setgid bit if the caller is not in the owning group + * or capable of CAP_FSETID (see inode_change_ok). + * + * Called from set_acl inode operations. + */ +int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +EXPORT_SYMBOL(posix_acl_update_mode); + /* * Fix up the uids and gids in posix acl extended attributes in place.
*/ diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 4b34b9dc03dd..9b1824f35501 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -246,13 +246,9 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - if (error == 0) - acl = NULL; - } } break; case ACL_TYPE_DEFAULT: diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 6bb470fbb8e8..c5101a3295d8 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -288,16 +288,11 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - - if (error <= 0) { - acl = NULL; - - if (error < 0) - return error; - } + umode_t mode; + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + return error; error = xfs_set_mode(inode, mode); if (error) return error; diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 3e96a6a76103..d1a8ad7e5ae4 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -95,6 +95,7 @@ extern int set_posix_acl(struct inode *, int, struct posix_acl *); extern int posix_acl_chmod(struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); +extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **); extern int simple_set_acl(struct inode *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); From dc2ad0661d6967e307bcdfa9172f681d21ea9a61 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 13 Dec 2016 10:33:34 -0800 Subject: [PATCH 0648/3220] FROMLIST: 9p: fix a potential acl leak (https://lkml.org/lkml/2016/12/13/579) posix_acl_update_mode() can clear 'acl'; if so, we leak the memory pointed to by 'acl'. Save this pointer before calling posix_acl_update_mode() and release the memory if 'acl' really gets cleared. Reported-by: Mark Salyzyn Reviewed-by: Jan Kara Reviewed-by: Greg Kurz Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Cong Wang Bug: 32458736 Change-Id: Ia78da401e6fd1bfd569653bd2cd0ebd3f9c737a0 --- fs/9p/acl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 929b618da43b..c30c6ceac2c4 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -283,6 +283,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, case ACL_TYPE_ACCESS: if (acl) { struct iattr iattr; + struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); if (retval) goto err_out; @@ -293,6 +294,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, * by the mode bits. So don't * update ACL.
*/ + posix_acl_release(old_acl); value = NULL; size = 0; } From d50112645714fc60c8f144710cfb595181494a8d Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 6 Feb 2017 14:27:24 -0800 Subject: [PATCH 0649/3220] ANDROID: android-recommended.cfg: CONFIG_CPU_SW_DOMAIN_PAN=y Bug: 31374660 Change-Id: Id2710a5fa2694da66d3f34cbcc0c2a58a006cec5 Signed-off-by: Sami Tolvanen --- android/configs/android-recommended.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 70aaae17ad29..28610303db60 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -15,6 +15,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y +CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DEBUG_RODATA=y CONFIG_DM_UEVENT=y CONFIG_DRAGONRISE_FF=y From 50dcddba6c11f2156b4dfefa87e86d961f46b89e Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Wed, 28 Sep 2016 12:14:39 -0700 Subject: [PATCH 0650/3220] Squashfs: remove the FILE_CACHE option FILE_DIRECT works fine and offers faster results and a lower memory footprint. Removing FILE_CACHE makes our life easier because we don't have to maintain two different functions that do the same thing. Signed-off-by: Adrien Schildknecht Change-Id: I6689ba74d0042c222a806f9edc539995e8e04c6b --- fs/squashfs/Kconfig | 28 --------------------------- fs/squashfs/Makefile | 3 +-- fs/squashfs/file_cache.c | 38 ------------------------------------ fs/squashfs/page_actor.h | 42 +--------------------------------------- 4 files changed, 2 insertions(+), 109 deletions(-) delete mode 100644 fs/squashfs/file_cache.c diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index ffb093e72b6c..6dd158a216f4 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -25,34 +25,6 @@ config SQUASHFS If unsure, say N. -choice - prompt "File decompression options" - depends on SQUASHFS - help - Squashfs now supports two options for decompressing file - data. Traditionally Squashfs has decompressed into an - intermediate buffer and then memcopied it into the page cache. - Squashfs now supports the ability to decompress directly into - the page cache. - - If unsure, select "Decompress file data into an intermediate buffer" - -config SQUASHFS_FILE_CACHE - bool "Decompress file data into an intermediate buffer" - help - Decompress file data into an intermediate buffer and then - memcopy it into the page cache. - -config SQUASHFS_FILE_DIRECT - bool "Decompress files directly into the page cache" - help - Directly decompress file data into the page cache. - Doing so can significantly improve performance because - it eliminates a memcpy and it also removes the lock contention - on the single buffer.
- -endchoice - choice prompt "Decompressor parallelisation options" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 246a6f329d89..fe51f1507ed1 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,8 +5,7 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o decompressor.o -squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o -squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-y += file_direct.o page_actor.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c deleted file mode 100644 index f2310d2a2019..000000000000 --- a/fs/squashfs/file_cache.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2013 - * Phillip Lougher - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "squashfs_fs.h" -#include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" -#include "squashfs.h" - -/* Read separately compressed datablock and memcopy into page cache */ -int squashfs_readpage_block(struct page *page, u64 block, int bsize) -{ - struct inode *i = page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int res = buffer->error; - - if (res) - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - else - squashfs_copy_cache(page, buffer, buffer->length, 0); - - squashfs_cache_put(buffer); - return res; -} diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 26dd82008b82..d2df0544e0df 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -8,46 +8,6 @@ * the COPYING file in the top-level directory. */ -#ifndef CONFIG_SQUASHFS_FILE_DIRECT -struct squashfs_page_actor { - void **page; - int pages; - int length; - int next_page; -}; - -static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_CACHE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - return actor; -} - -static inline void *squashfs_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->page[0]; -} - -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->next_page == actor->pages ? NULL : - actor->page[actor->next_page++]; -} - -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} -#else struct squashfs_page_actor { union { void **buffer; @@ -77,5 +37,5 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { actor->squashfs_finish_page(actor); } -#endif + #endif From 417aca479b15292460e981f969d3bcb7fb89517c Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Wed, 28 Sep 2016 13:59:18 -0700 Subject: [PATCH 0651/3220] Squashfs: refactor page_actor This patch essentially does 3 things: 1/ Always use an array of pages to store the data instead of a mix of buffers and pages.
2/ It is now possible to have 'holes' in a page actor, i.e. NULL pages in the array. When reading a block (default 128K), squashfs tries to grab all the pages covering this block. If a single page is up-to-date or locked, it falls back to using an intermediate buffer to do the read and then copies the data into the actor's pages. Allowing holes in the page actor removes the need for this intermediate buffer. 3/ Refactor the wrappers to share code that deals with page actors. Signed-off-by: Adrien Schildknecht Change-Id: I98128bed5d518cf31b67e788a85b275e9a323bec --- fs/squashfs/cache.c | 73 +++++-------- fs/squashfs/decompressor.c | 53 ++++----- fs/squashfs/file_direct.c | 4 +- fs/squashfs/lz4_wrapper.c | 32 +----- fs/squashfs/lzo_wrapper.c | 40 ++----- fs/squashfs/page_actor.c | 201 ++++++++++++++++++++++------------- fs/squashfs/page_actor.h | 64 +++++++---- fs/squashfs/squashfs_fs_sb.h | 2 +- fs/squashfs/xz_wrapper.c | 15 ++- fs/squashfs/zlib_wrapper.c | 14 ++- 10 files changed, 270 insertions(+), 228 deletions(-) diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 1cb70a0b2168..6785d086ab38 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -209,17 +209,14 @@ void squashfs_cache_put(struct squashfs_cache_entry *entry) */ void squashfs_cache_delete(struct squashfs_cache *cache) { - int i, j; + int i; if (cache == NULL) return; for (i = 0; i < cache->entries; i++) { - if (cache->entry[i].data) { - for (j = 0; j < cache->pages; j++) - kfree(cache->entry[i].data[j]); - kfree(cache->entry[i].data); - } + if (cache->entry[i].page) + free_page_array(cache->entry[i].page, cache->pages); kfree(cache->entry[i].actor); } @@ -236,7 +233,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) struct squashfs_cache *squashfs_cache_init(char *name, int entries, int block_size) { - int i, j; + int i; struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) { @@ -268,22 +265,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, init_waitqueue_head(&cache->entry[i].wait_queue); entry->cache = cache; entry->block = SQUASHFS_INVALID_BLK; - entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); - if (entry->data == NULL) { + entry->page = alloc_page_array(cache->pages, GFP_KERNEL); + if (!entry->page) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; } - - for (j = 0; j < cache->pages; j++) { - entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); - if (entry->data[j] == NULL) { - ERROR("Failed to allocate %s buffer\n", name); - goto cleanup; - } - } - - entry->actor = squashfs_page_actor_init(entry->data, - cache->pages, 0); + entry->actor = squashfs_page_actor_init(entry->page, + cache->pages, 0, NULL); if (entry->actor == NULL) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; @@ -314,18 +302,20 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, return min(length, entry->length - offset); while (offset < entry->length) { - void *buff = entry->data[offset / PAGE_CACHE_SIZE] - + (offset % PAGE_CACHE_SIZE); + void *buff = kmap_atomic(entry->page[offset / PAGE_CACHE_SIZE]) + + (offset % PAGE_CACHE_SIZE); int bytes = min_t(int, entry->length - offset, PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); if (bytes >= remaining) { memcpy(buffer, buff, remaining); + kunmap_atomic(buff); remaining = 0; break; } memcpy(buffer, buff, bytes); + kunmap_atomic(buff); buffer += bytes; remaining -= bytes; offset += bytes; @@ -416,43 +406,38 @@ struct squashfs_cache_entry
*squashfs_get_datablock(struct super_block *sb, void *squashfs_read_table(struct super_block *sb, u64 block, int length) { int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - int i, res; - void *table, *buffer, **data; + struct page **page; + void *buff; + int res; struct squashfs_page_actor *actor; - table = buffer = kmalloc(length, GFP_KERNEL); - if (table == NULL) + page = alloc_page_array(pages, GFP_KERNEL); + if (!page) return ERR_PTR(-ENOMEM); - data = kcalloc(pages, sizeof(void *), GFP_KERNEL); - if (data == NULL) { + actor = squashfs_page_actor_init(page, pages, length, NULL); + if (actor == NULL) { res = -ENOMEM; goto failed; } - actor = squashfs_page_actor_init(data, pages, length); - if (actor == NULL) { - res = -ENOMEM; - goto failed2; - } - - for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) - data[i] = buffer; - res = squashfs_read_data(sb, block, length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); - kfree(data); - kfree(actor); - if (res < 0) - goto failed; + goto failed2; - return table; + buff = kmalloc(length, GFP_KERNEL); + if (!buff) + goto failed2; + squashfs_actor_to_buf(actor, buff, length); + squashfs_page_actor_free(actor, 0); + free_page_array(page, pages); + return buff; failed2: - kfree(data); + squashfs_page_actor_free(actor, 0); failed: - kfree(table); + free_page_array(page, pages); return ERR_PTR(res); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index e9034bf6e5ae..7de35bf297aa 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -24,7 +24,8 @@ #include #include #include -#include +#include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -94,40 +95,44 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; - void *buffer = NULL, *comp_opts; + void *comp_opts, *buffer = NULL; + struct page *page; struct squashfs_page_actor *actor = NULL; int length = 0; + if (!SQUASHFS_COMP_OPTS(flags)) + return squashfs_comp_opts(msblk, buffer, length); + /* * Read decompressor specific options from file system if present */ - if (SQUASHFS_COMP_OPTS(flags)) { - buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); - if (buffer == NULL) { - comp_opts = ERR_PTR(-ENOMEM); - goto out; - } - actor = squashfs_page_actor_init(&buffer, 1, 0); - if (actor == NULL) { - comp_opts = ERR_PTR(-ENOMEM); - goto out; - } + page = alloc_page(GFP_KERNEL); + if (!page) + return ERR_PTR(-ENOMEM); - length = squashfs_read_data(sb, - sizeof(struct squashfs_super_block), 0, NULL, actor); - - if (length < 0) { - comp_opts = ERR_PTR(length); - goto out; - } + actor = squashfs_page_actor_init(&page, 1, 0, NULL); + if (actor == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto actor_error; } - comp_opts = squashfs_comp_opts(msblk, buffer, length); + length = squashfs_read_data(sb, + sizeof(struct squashfs_super_block), 0, NULL, actor); -out: - kfree(actor); - kfree(buffer); + if (length < 0) { + comp_opts = ERR_PTR(length); + goto read_error; + } + + buffer = kmap_atomic(page); + comp_opts = squashfs_comp_opts(msblk, buffer, length); + kunmap_atomic(buffer); + +read_error: + squashfs_page_actor_free(actor, 0); +actor_error: + __free_page(page); return comp_opts; } diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 43e7a7eddac0..9d033ec0f133 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -52,7 +52,7 @@ int squashfs_readpage_block(struct 
page *target_page, u64 block, int bsize) * Create a "page actor" which will kmap and kunmap the * page cache pages appropriately within the decompressor */ - actor = squashfs_page_actor_init_special(page, pages, 0); + actor = squashfs_page_actor_init(page, pages, 0, NULL); if (actor == NULL) goto out; @@ -131,7 +131,7 @@ mark_errored: } out: - kfree(actor); + squashfs_page_actor_free(actor, 0); kfree(page); return res; } diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index c31e2bc9c081..df4fa3c7ddd0 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -94,39 +94,17 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_lz4 *stream = strm; - void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int res; size_t dest_len = output->length; + struct squashfs_lz4 *stream = strm; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); - buff += avail; - bytes -= avail; - offset = 0; - put_bh(bh[i]); - } - + squashfs_bh_to_buf(bh, b, stream->input, offset, length, + msblk->devblksize); res = lz4_decompress_unknownoutputsize(stream->input, length, stream->output, &dest_len); if (res) return -EIO; - - bytes = dest_len; - data = squashfs_first_page(output); - buff = stream->output; - while (data) { - if (bytes <= PAGE_CACHE_SIZE) { - memcpy(data, buff, bytes); - break; - } - memcpy(data, buff, PAGE_CACHE_SIZE); - buff += PAGE_CACHE_SIZE; - bytes -= PAGE_CACHE_SIZE; - data = squashfs_next_page(output); - } - squashfs_finish_page(output); + squashfs_buf_to_actor(stream->output, output, dest_len); return dest_len; } diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 244b9fbfff7b..2c844d53a59e 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -79,45 +79,19 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_lzo *stream = strm; - void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int res; size_t out_len = output->length; + struct squashfs_lzo *stream = strm; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); - buff += avail; - bytes -= avail; - offset = 0; - put_bh(bh[i]); - } - + squashfs_bh_to_buf(bh, b, stream->input, offset, length, + msblk->devblksize); res = lzo1x_decompress_safe(stream->input, (size_t)length, stream->output, &out_len); if (res != LZO_E_OK) - goto failed; + return -EIO; + squashfs_buf_to_actor(stream->output, output, out_len); - res = bytes = (int)out_len; - data = squashfs_first_page(output); - buff = stream->output; - while (data) { - if (bytes <= PAGE_CACHE_SIZE) { - memcpy(data, buff, bytes); - break; - } else { - memcpy(data, buff, PAGE_CACHE_SIZE); - buff += PAGE_CACHE_SIZE; - bytes -= PAGE_CACHE_SIZE; - data = squashfs_next_page(output); - } - } - squashfs_finish_page(output); - - return res; - -failed: - return -EIO; + return out_len; } const struct squashfs_decompressor squashfs_lzo_comp_ops = { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 5a1c11f56441..53863508e400 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -9,79 +9,11 @@ #include #include #include +#include #include "page_actor.h" -/* - * 
This file contains implementations of page_actor for decompressing into - * an intermediate buffer, and for decompressing directly into the - * page cache. - * - * Calling code should avoid sleeping between calls to squashfs_first_page() - * and squashfs_finish_page(). - */ - -/* Implementation of page_actor for decompressing into intermediate buffer */ -static void *cache_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->buffer[0]; -} - -static void *cache_next_page(struct squashfs_page_actor *actor) -{ - if (actor->next_page == actor->pages) - return NULL; - - return actor->buffer[actor->next_page++]; -} - -static void cache_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} - -struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_CACHE_SIZE; - actor->buffer = buffer; - actor->pages = pages; - actor->next_page = 0; - actor->squashfs_first_page = cache_first_page; - actor->squashfs_next_page = cache_next_page; - actor->squashfs_finish_page = cache_finish_page; - return actor; -} - -/* Implementation of page_actor for decompressing directly into page cache. */ -static void *direct_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->pageaddr = kmap_atomic(actor->page[0]); -} - -static void *direct_next_page(struct squashfs_page_actor *actor) -{ - if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); - - return actor->pageaddr = actor->next_page == actor->pages ? NULL : - kmap_atomic(actor->page[actor->next_page++]); -} - -static void direct_finish_page(struct squashfs_page_actor *actor) -{ - if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); -} - -struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, - int pages, int length) +struct squashfs_page_actor *squashfs_page_actor_init(struct page **page, + int pages, int length, void (*release_pages)(struct page **, int, int)) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -93,8 +25,129 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, actor->pages = pages; actor->next_page = 0; actor->pageaddr = NULL; - actor->squashfs_first_page = direct_first_page; - actor->squashfs_next_page = direct_next_page; - actor->squashfs_finish_page = direct_finish_page; + actor->release_pages = release_pages; return actor; } + +void squashfs_page_actor_free(struct squashfs_page_actor *actor, int error) +{ + if (!actor) + return; + + if (actor->release_pages) + actor->release_pages(actor->page, actor->pages, error); + kfree(actor); +} + +void squashfs_actor_to_buf(struct squashfs_page_actor *actor, void *buf, + int length) +{ + void *pageaddr; + int pos = 0, avail, i; + + for (i = 0; i < actor->pages && pos < length; ++i) { + avail = min_t(int, length - pos, PAGE_CACHE_SIZE); + if (actor->page[i]) { + pageaddr = kmap_atomic(actor->page[i]); + memcpy(buf + pos, pageaddr, avail); + kunmap_atomic(pageaddr); + } + pos += avail; + } +} + +void squashfs_buf_to_actor(void *buf, struct squashfs_page_actor *actor, + int length) +{ + void *pageaddr; + int pos = 0, avail, i; + + for (i = 0; i < actor->pages && pos < length; ++i) { + avail = min_t(int, length - pos, PAGE_CACHE_SIZE); + if (actor->page[i]) { + pageaddr = kmap_atomic(actor->page[i]); + memcpy(pageaddr, buf + pos, avail); + 
kunmap_atomic(pageaddr); + } + pos += avail; + } +} + +void squashfs_bh_to_actor(struct buffer_head **bh, int nr_buffers, + struct squashfs_page_actor *actor, int offset, int length, int blksz) +{ + void *kaddr = NULL; + int bytes = 0, pgoff = 0, b = 0, p = 0, avail, i; + + while (bytes < length) { + if (actor->page[p]) { + kaddr = kmap_atomic(actor->page[p]); + while (pgoff < PAGE_CACHE_SIZE && bytes < length) { + avail = min_t(int, blksz - offset, + PAGE_CACHE_SIZE - pgoff); + memcpy(kaddr + pgoff, bh[b]->b_data + offset, + avail); + pgoff += avail; + bytes += avail; + offset = (offset + avail) % blksz; + if (!offset) { + put_bh(bh[b]); + ++b; + } + } + kunmap_atomic(kaddr); + pgoff = 0; + } else { + for (i = 0; i < PAGE_CACHE_SIZE / blksz; ++i) { + if (bh[b]) + put_bh(bh[b]); + ++b; + } + bytes += PAGE_CACHE_SIZE; + } + ++p; + } +} + +void squashfs_bh_to_buf(struct buffer_head **bh, int nr_buffers, void *buf, + int offset, int length, int blksz) +{ + int i, avail, bytes = 0; + + for (i = 0; i < nr_buffers && bytes < length; ++i) { + avail = min_t(int, length - bytes, blksz - offset); + if (bh[i]) { + memcpy(buf + bytes, bh[i]->b_data + offset, avail); + put_bh(bh[i]); + } + bytes += avail; + offset = 0; + } +} + +void free_page_array(struct page **page, int nr_pages) +{ + int i; + + for (i = 0; i < nr_pages; ++i) + __free_page(page[i]); + kfree(page); +} + +struct page **alloc_page_array(int nr_pages, int gfp_mask) +{ + int i; + struct page **page; + + page = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); + if (!page) + return NULL; + for (i = 0; i < nr_pages; ++i) { + page[i] = alloc_page(gfp_mask); + if (!page[i]) { + free_page_array(page, i); + return NULL; + } + } + return page; +} diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index d2df0544e0df..aa1ed790b5a3 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -5,37 +5,61 @@ * Phillip Lougher * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. */ struct squashfs_page_actor { - union { - void **buffer; - struct page **page; - }; + struct page **page; void *pageaddr; - void *(*squashfs_first_page)(struct squashfs_page_actor *); - void *(*squashfs_next_page)(struct squashfs_page_actor *); - void (*squashfs_finish_page)(struct squashfs_page_actor *); int pages; int length; int next_page; + void (*release_pages)(struct page **, int, int); }; -extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); -extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init(struct page **, + int, int, void (*)(struct page **, int, int)); +extern void squashfs_page_actor_free(struct squashfs_page_actor *, int); + +extern void squashfs_actor_to_buf(struct squashfs_page_actor *, void *, int); +extern void squashfs_buf_to_actor(void *, struct squashfs_page_actor *, int); +extern void squashfs_bh_to_actor(struct buffer_head **, int, + struct squashfs_page_actor *, int, int, int); +extern void squashfs_bh_to_buf(struct buffer_head **, int, void *, int, int, + int); + +/* + * Calling code should avoid sleeping between calls to squashfs_first_page() + * and squashfs_finish_page().
+ */ static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { - return actor->squashfs_first_page(actor); -} -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->squashfs_next_page(actor); -} -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - actor->squashfs_finish_page(actor); + actor->next_page = 1; + return actor->pageaddr = actor->page[0] ? kmap_atomic(actor->page[0]) + : NULL; } +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + if (!IS_ERR_OR_NULL(actor->pageaddr)) + kunmap_atomic(actor->pageaddr); + + if (actor->next_page == actor->pages) + return actor->pageaddr = ERR_PTR(-ENODATA); + + actor->pageaddr = actor->page[actor->next_page] ? + kmap_atomic(actor->page[actor->next_page]) : NULL; + ++actor->next_page; + return actor->pageaddr; +} + +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + if (!IS_ERR_OR_NULL(actor->pageaddr)) + kunmap_atomic(actor->pageaddr); +} + +extern struct page **alloc_page_array(int, int); +extern void free_page_array(struct page **, int); + #endif diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1da565cb50c3..8a6995de0277 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -49,7 +49,7 @@ struct squashfs_cache_entry { int num_waiters; wait_queue_head_t wait_queue; struct squashfs_cache *cache; - void **data; + struct page **page; struct squashfs_page_actor *actor; }; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index c609624e4b8a..14cd373e1897 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -55,7 +55,7 @@ static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk, struct comp_opts *opts; int err = 0, n; - opts = kmalloc(sizeof(*opts), GFP_KERNEL); + opts = kmalloc(sizeof(*opts), GFP_ATOMIC); if (opts == NULL) { err = -ENOMEM; goto out2; @@ -136,6 +136,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, enum xz_ret xz_err; int avail, total = 0, k = 0; struct squashfs_xz *stream = strm; + void *buf = NULL; xz_dec_reset(stream->state); stream->buf.in_pos = 0; @@ -156,12 +157,20 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->buf.out_pos == stream->buf.out_size) { stream->buf.out = squashfs_next_page(output); - if (stream->buf.out != NULL) { + if (!IS_ERR(stream->buf.out)) { stream->buf.out_pos = 0; total += PAGE_CACHE_SIZE; } } + if (!stream->buf.out) { + if (!buf) { + buf = kmalloc(PAGE_CACHE_SIZE, GFP_ATOMIC); + if (!buf) + goto out; + } + stream->buf.out = buf; + } xz_err = xz_dec_run(stream->state, &stream->buf); if (stream->buf.in_pos == stream->buf.in_size && k < b) @@ -173,11 +182,13 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (xz_err != XZ_STREAM_END || k < b) goto out; + kfree(buf); return total + stream->buf.out_pos; out: for (; k < b; k++) put_bh(bh[k]); + kfree(buf); return -EIO; } diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 8727caba6882..09c892b5308e 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -66,6 +66,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { + void *buf = NULL; int zlib_err, zlib_init = 0, k = 0; z_stream *stream = strm; @@ -84,10 +85,19 @@ static int zlib_uncompress(struct 
squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); - if (stream->next_out != NULL) + if (!IS_ERR(stream->next_out)) stream->avail_out = PAGE_CACHE_SIZE; } + if (!stream->next_out) { + if (!buf) { + buf = kmalloc(PAGE_CACHE_SIZE, GFP_ATOMIC); + if (!buf) + goto out; + } + stream->next_out = buf; + } + if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { @@ -115,11 +125,13 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (k < b) goto out; + kfree(buf); return stream->total_out; out: for (; k < b; k++) put_bh(bh[k]); + kfree(buf); return -EIO; } From c9994560db8e33d933a3e844b75e5ba2f40764db Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Mon, 7 Nov 2016 12:37:55 -0800 Subject: [PATCH 0652/3220] Squashfs: replace buffer_head with BIO ll_rw_block() has been deprecated and BIO is now the basic container for block I/O within the kernel. Switching to BIO offers two advantages: 1/ It removes the synchronous wait for up-to-date buffers: SquashFS now deals with decompressions/copies asynchronously. Implementing an asynchronous mechanism to read data is needed to efficiently implement .readpages(). 2/ Prior to this patch, merging the read requests depended entirely on the IO scheduler. SquashFS has more information than the IO scheduler about what could be merged. Moreover, merging the reads at the FS level means that we rely less on the IO scheduler. Signed-off-by: Adrien Schildknecht Change-Id: I775d2e11f017476e1899518ab52d9d0a8a0bce28 --- fs/squashfs/block.c | 527 +++++++++++++++++++++++++++----------- fs/squashfs/file_direct.c | 204 +++++---------- fs/squashfs/squashfs.h | 6 + fs/squashfs/super.c | 7 + 4 files changed, 464 insertions(+), 280 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 0cea9b9236d0..8a75d812ffdf 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -28,9 +28,12 @@ #include #include +#include #include #include +#include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -38,45 +41,355 @@ #include "decompressor.h" #include "page_actor.h" -/* - * Read the metadata block length, this is stored in the first two - * bytes of the metadata block. - */ -static struct buffer_head *get_block_length(struct super_block *sb, - u64 *cur_index, int *offset, int *length) +static struct workqueue_struct *squashfs_read_wq; + +struct squashfs_read_request { + struct super_block *sb; + u64 index; + int length; + int compressed; + int offset; + u64 read_end; + struct squashfs_page_actor *output; + enum { + SQUASHFS_COPY, + SQUASHFS_DECOMPRESS, + SQUASHFS_METADATA, + } data_processing; + bool synchronous; + + /* + * If the read is synchronous, it is possible to retrieve information + * about the request by setting these pointers.
+ */ + int *res; + int *bytes_read; + int *bytes_uncompressed; + + int nr_buffers; + struct buffer_head **bh; + struct work_struct offload; +}; + +struct squashfs_bio_request { + struct buffer_head **bh; + int nr_buffers; +}; + +static int squashfs_bio_submit(struct squashfs_read_request *req); + +int squashfs_init_read_wq(void) { - struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head *bh; + squashfs_read_wq = create_workqueue("SquashFS read wq"); + return !!squashfs_read_wq; +} - bh = sb_bread(sb, *cur_index); - if (bh == NULL) - return NULL; +void squashfs_destroy_read_wq(void) +{ + flush_workqueue(squashfs_read_wq); + destroy_workqueue(squashfs_read_wq); +} - if (msblk->devblksize - *offset == 1) { - *length = (unsigned char) bh->b_data[*offset]; - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *length |= (unsigned char) bh->b_data[0] << 8; - *offset = 1; - } else { - *length = (unsigned char) bh->b_data[*offset] | - (unsigned char) bh->b_data[*offset + 1] << 8; - *offset += 2; +static void free_read_request(struct squashfs_read_request *req, int error) +{ + if (!req->synchronous) + squashfs_page_actor_free(req->output, error); + if (req->res) + *(req->res) = error; + kfree(req->bh); + kfree(req); +} - if (*offset == msblk->devblksize) { - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *offset = 0; +static void squashfs_process_blocks(struct squashfs_read_request *req) +{ + int error = 0; + int bytes, i, length; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + struct squashfs_page_actor *actor = req->output; + struct buffer_head **bh = req->bh; + int nr_buffers = req->nr_buffers; + + for (i = 0; i < nr_buffers; ++i) { + if (!bh[i]) + continue; + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + error = -EIO; + } + if (error) + goto cleanup; + + if (req->data_processing == SQUASHFS_METADATA) { + /* Extract the length of the metadata block */ + if (req->offset != msblk->devblksize - 1) + length = *((u16 *)(bh[0]->b_data + req->offset)); + else { + length = bh[0]->b_data[req->offset]; + length |= bh[1]->b_data[0] << 8; + } + req->compressed = SQUASHFS_COMPRESSED(length); + req->data_processing = req->compressed ? 
SQUASHFS_DECOMPRESS + : SQUASHFS_COPY; + length = SQUASHFS_COMPRESSED_SIZE(length); + if (req->index + length + 2 > req->read_end) { + for (i = 0; i < nr_buffers; ++i) + put_bh(bh[i]); + kfree(bh); + req->length = length; + req->index += 2; + squashfs_bio_submit(req); + return; + } + req->length = length; + req->offset = (req->offset + 2) % PAGE_SIZE; + if (req->offset < 2) { + put_bh(bh[0]); + ++bh; + --nr_buffers; + } + } + if (req->bytes_read) + *(req->bytes_read) = req->length; + + if (req->data_processing == SQUASHFS_COPY) { + squashfs_bh_to_actor(bh, nr_buffers, req->output, req->offset, + req->length, msblk->devblksize); + } else if (req->data_processing == SQUASHFS_DECOMPRESS) { + req->length = squashfs_decompress(msblk, bh, nr_buffers, + req->offset, req->length, actor); + if (req->length < 0) { + error = -EIO; + goto cleanup; } } - return bh; + /* Last page may have trailing bytes not filled */ + bytes = req->length % PAGE_SIZE; + if (bytes && actor->page[actor->pages - 1]) + zero_user_segment(actor->page[actor->pages - 1], bytes, + PAGE_SIZE); + +cleanup: + if (req->bytes_uncompressed) + *(req->bytes_uncompressed) = req->length; + if (error) { + for (i = 0; i < nr_buffers; ++i) + if (bh[i]) + put_bh(bh[i]); + } + free_read_request(req, error); } +static void read_wq_handler(struct work_struct *work) +{ + squashfs_process_blocks(container_of(work, + struct squashfs_read_request, offload)); +} + +static void squashfs_bio_end_io(struct bio *bio) +{ + int i; + int error = bio->bi_error; + struct squashfs_bio_request *bio_req = bio->bi_private; + + bio_put(bio); + + for (i = 0; i < bio_req->nr_buffers; ++i) { + if (!bio_req->bh[i]) + continue; + if (!error) + set_buffer_uptodate(bio_req->bh[i]); + else + clear_buffer_uptodate(bio_req->bh[i]); + unlock_buffer(bio_req->bh[i]); + } + kfree(bio_req); +} + +static int actor_getblks(struct squashfs_read_request *req, u64 block) +{ + int i; + + req->bh = kmalloc_array(req->nr_buffers, sizeof(*(req->bh)), GFP_NOIO); + if (!req->bh) + return -ENOMEM; + + for (i = 0; i < req->nr_buffers; ++i) { + req->bh[i] = sb_getblk(req->sb, block + i); + if (!req->bh[i]) { + while (--i) { + if (req->bh[i]) + put_bh(req->bh[i]); + } + return -1; + } + } + return 0; +} + +static int squashfs_bio_submit(struct squashfs_read_request *req) +{ + struct bio *bio = NULL; + struct buffer_head *bh; + struct squashfs_bio_request *bio_req = NULL; + int b = 0, prev_block = 0; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + + u64 read_start = round_down(req->index, msblk->devblksize); + u64 read_end = round_up(req->index + req->length, msblk->devblksize); + sector_t block = read_start >> msblk->devblksize_log2; + sector_t block_end = read_end >> msblk->devblksize_log2; + int offset = read_start - round_down(req->index, PAGE_SIZE); + int nr_buffers = block_end - block; + int blksz = msblk->devblksize; + int bio_max_pages = nr_buffers > BIO_MAX_PAGES ? 
BIO_MAX_PAGES + : nr_buffers; + + /* Setup the request */ + req->read_end = read_end; + req->offset = req->index - read_start; + req->nr_buffers = nr_buffers; + if (actor_getblks(req, block) < 0) + goto getblk_failed; + + /* Create and submit the BIOs */ + for (b = 0; b < nr_buffers; ++b, offset += blksz) { + bh = req->bh[b]; + if (!bh || !trylock_buffer(bh)) + continue; + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + offset %= PAGE_SIZE; + + /* Append the buffer to the current BIO if it is contiguous */ + if (bio && bio_req && prev_block + 1 == b) { + if (bio_add_page(bio, bh->b_page, blksz, offset)) { + bio_req->nr_buffers += 1; + prev_block = b; + continue; + } + } + + /* Otherwise, submit the current BIO and create a new one */ + if (bio) + submit_bio(READ, bio); + bio_req = kcalloc(1, sizeof(struct squashfs_bio_request), + GFP_NOIO); + if (!bio_req) + goto req_alloc_failed; + bio_req->bh = &req->bh[b]; + bio = bio_alloc(GFP_NOIO, bio_max_pages); + if (!bio) + goto bio_alloc_failed; + bio->bi_bdev = req->sb->s_bdev; + bio->bi_iter.bi_sector = (block + b) + << (msblk->devblksize_log2 - 9); + bio->bi_private = bio_req; + bio->bi_end_io = squashfs_bio_end_io; + + bio_add_page(bio, bh->b_page, blksz, offset); + bio_req->nr_buffers += 1; + prev_block = b; + } + if (bio) + submit_bio(READ, bio); + + if (req->synchronous) + squashfs_process_blocks(req); + else { + INIT_WORK(&req->offload, read_wq_handler); + schedule_work(&req->offload); + } + return 0; + +bio_alloc_failed: + kfree(bio_req); +req_alloc_failed: + unlock_buffer(bh); + while (--nr_buffers >= b) + if (req->bh[nr_buffers]) + put_bh(req->bh[nr_buffers]); + while (--b >= 0) + if (req->bh[b]) + wait_on_buffer(req->bh[b]); +getblk_failed: + free_read_request(req, -ENOMEM); + return -ENOMEM; +} + +static int read_metadata_block(struct squashfs_read_request *req, + u64 *next_index) +{ + int ret, error, bytes_read = 0, bytes_uncompressed = 0; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + + if (req->index + 2 > msblk->bytes_used) { + free_read_request(req, -EINVAL); + return -EINVAL; + } + req->length = 2; + + /* Do not read beyond the end of the device */ + if (req->index + req->length > msblk->bytes_used) + req->length = msblk->bytes_used - req->index; + req->data_processing = SQUASHFS_METADATA; + + /* + * Reading metadata is always synchronous because we don't know the + * length in advance and the function is expected to update + * 'next_index' and return the length. + */ + req->synchronous = true; + req->res = &error; + req->bytes_read = &bytes_read; + req->bytes_uncompressed = &bytes_uncompressed; + + TRACE("Metadata block @ 0x%llx, %scompressed size %d, src size %d\n", + req->index, req->compressed ? "" : "un", bytes_read, + req->output->length); + + ret = squashfs_bio_submit(req); + if (ret) + return ret; + if (error) + return error; + if (next_index) + *next_index += 2 + bytes_read; + return bytes_uncompressed; +} + +static int read_data_block(struct squashfs_read_request *req, int length, + u64 *next_index, bool synchronous) +{ + int ret, error = 0, bytes_uncompressed = 0, bytes_read = 0; + + req->compressed = SQUASHFS_COMPRESSED_BLOCK(length); + req->length = length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + req->data_processing = req->compressed ? 
SQUASHFS_DECOMPRESS + : SQUASHFS_COPY; + + req->synchronous = synchronous; + if (synchronous) { + req->res = &error; + req->bytes_read = &bytes_read; + req->bytes_uncompressed = &bytes_uncompressed; + } + + TRACE("Data block @ 0x%llx, %scompressed size %d, src size %d\n", + req->index, req->compressed ? "" : "un", req->length, + req->output->length); + + ret = squashfs_bio_submit(req); + if (ret) + return ret; + if (synchronous) + ret = error ? error : bytes_uncompressed; + if (next_index) + *next_index += length; + return ret; +} /* * Read and decompress a metadata block or datablock. Length is non-zero @@ -87,128 +400,50 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with compression * algorithms). */ -int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) +static int __squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output, bool sync) { - struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head **bh; - int offset = index & ((1 << msblk->devblksize_log2) - 1); - u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, avail, i; + struct squashfs_read_request *req; - bh = kcalloc(((output->length + msblk->devblksize - 1) - >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); - if (bh == NULL) + req = kcalloc(1, sizeof(struct squashfs_read_request), GFP_KERNEL); + if (!req) { + if (!sync) + squashfs_page_actor_free(output, -ENOMEM); return -ENOMEM; - - if (length) { - /* - * Datablock. - */ - bytes = -offset; - compressed = SQUASHFS_COMPRESSED_BLOCK(length); - length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - if (next_index) - *next_index = index + length; - - TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", - index, compressed ? "" : "un", length, output->length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto read_failure; - - for (b = 0; bytes < length; b++, cur_index++) { - bh[b] = sb_getblk(sb, cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(READ, b, bh); - } else { - /* - * Metadata block. - */ - if ((index + 2) > msblk->bytes_used) - goto read_failure; - - bh[0] = get_block_length(sb, &cur_index, &offset, &length); - if (bh[0] == NULL) - goto read_failure; - b = 1; - - bytes = msblk->devblksize - offset; - compressed = SQUASHFS_COMPRESSED(length); - length = SQUASHFS_COMPRESSED_SIZE(length); - if (next_index) - *next_index = index + length + 2; - - TRACE("Block @ 0x%llx, %scompressed size %d\n", index, - compressed ? 
"" : "un", length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto block_release; - - for (; bytes < length; b++) { - bh[b] = sb_getblk(sb, ++cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(READ, b - 1, bh + 1); } - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; + req->sb = sb; + req->index = index; + req->output = output; + + if (next_index) + *next_index = index; + + if (length) + length = read_data_block(req, length, next_index, sync); + else + length = read_metadata_block(req, next_index); + + if (length < 0) { + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long)index); + return -EIO; } - if (compressed) { - length = squashfs_decompress(msblk, bh, b, offset, length, - output); - if (length < 0) - goto read_failure; - } else { - /* - * Block is uncompressed. - */ - int in, pg_offset = 0; - void *data = squashfs_first_page(output); - - for (bytes = length; k < b; k++) { - in = min(bytes, msblk->devblksize - offset); - bytes -= in; - while (in) { - if (pg_offset == PAGE_CACHE_SIZE) { - data = squashfs_next_page(output); - pg_offset = 0; - } - avail = min_t(int, in, PAGE_CACHE_SIZE - - pg_offset); - memcpy(data + pg_offset, bh[k]->b_data + offset, - avail); - in -= avail; - pg_offset += avail; - offset += avail; - } - offset = 0; - put_bh(bh[k]); - } - squashfs_finish_page(output); - } - - kfree(bh); return length; - -block_release: - for (; k < b; k++) - put_bh(bh[k]); - -read_failure: - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long) index); - kfree(bh); - return -EIO; +} + +int squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) +{ + return __squashfs_read_data(sb, index, length, next_index, output, + true); +} + +int squashfs_read_data_async(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) +{ + + return __squashfs_read_data(sb, index, length, next_index, output, + false); } diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 9d033ec0f133..10fe1272535f 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -20,8 +20,68 @@ #include "squashfs.h" #include "page_actor.h" -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page); +static void release_actor_pages(struct page **page, int pages, int error) +{ + int i; + + for (i = 0; i < pages; i++) { + if (!page[i]) + continue; + flush_dcache_page(page[i]); + if (!error) + SetPageUptodate(page[i]); + else { + SetPageError(page[i]); + zero_user_segment(page[i], 0, PAGE_CACHE_SIZE); + } + unlock_page(page[i]); + put_page(page[i]); + } + kfree(page); +} + +/* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ +static struct squashfs_page_actor *actor_from_page_cache( + struct page *target_page, int start_index, int nr_pages) +{ + int i, n; + struct page **page; + struct squashfs_page_actor *actor; + + page = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + if (!page) + return NULL; + + /* Try to grab all the pages covered by the SquashFS block */ + for (i = 0, n = start_index; i < nr_pages; i++, n++) { + if (target_page->index == n) { + page[i] = target_page; + } else { + page[i] = grab_cache_page_nowait(target_page->mapping, + n); + if 
(page[i] == NULL) + continue; + } + + if (PageUptodate(page[i])) { + unlock_page(page[i]); + put_page(page[i]); + page[i] = NULL; + } + } + + actor = squashfs_page_actor_init(page, nr_pages, 0, + release_actor_pages); + if (!actor) { + release_actor_pages(page, nr_pages, -ENOMEM); + kfree(page); + return NULL; + } + return actor; +} /* Read separately compressed datablock directly into page cache */ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) @@ -34,143 +94,19 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; int start_index = target_page->index & ~mask; int end_index = start_index | mask; - int i, n, pages, missing_pages, bytes, res = -ENOMEM; - struct page **page; + int pages, res = -ENOMEM; struct squashfs_page_actor *actor; - void *pageaddr; if (end_index > file_end) end_index = file_end; - pages = end_index - start_index + 1; - page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); - if (page == NULL) - return res; + actor = actor_from_page_cache(target_page, start_index, pages); + if (!actor) + return -ENOMEM; - /* - * Create a "page actor" which will kmap and kunmap the - * page cache pages appropriately within the decompressor - */ - actor = squashfs_page_actor_init(page, pages, 0, NULL); - if (actor == NULL) - goto out; - - /* Try to grab all the pages covered by the Squashfs block */ - for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { - page[i] = (n == target_page->index) ? target_page : - grab_cache_page_nowait(target_page->mapping, n); - - if (page[i] == NULL) { - missing_pages++; - continue; - } - - if (PageUptodate(page[i])) { - unlock_page(page[i]); - page_cache_release(page[i]); - page[i] = NULL; - missing_pages++; - } - } - - if (missing_pages) { - /* - * Couldn't get one or more pages, this page has either - * been VM reclaimed, but others are still in the page cache - * and uptodate, or we're racing with another thread in - * squashfs_readpage also trying to grab them. Fall back to - * using an intermediate buffer. - */ - res = squashfs_read_cache(target_page, block, bsize, pages, - page); - if (res < 0) - goto mark_errored; - - goto out; - } - - /* Decompress directly into the page cache buffers */ - res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - if (res < 0) - goto mark_errored; - - /* Last page may have trailing bytes not filled */ - bytes = res % PAGE_CACHE_SIZE; - if (bytes) { - pageaddr = kmap_atomic(page[pages - 1]); - memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); - kunmap_atomic(pageaddr); - } - - /* Mark pages as uptodate, unlock and release */ - for (i = 0; i < pages; i++) { - flush_dcache_page(page[i]); - SetPageUptodate(page[i]); - unlock_page(page[i]); - if (page[i] != target_page) - page_cache_release(page[i]); - } - - kfree(actor); - kfree(page); - - return 0; - -mark_errored: - /* Decompression failed, mark pages as errored. 
Target_page is - * dealt with by the caller - */ - for (i = 0; i < pages; i++) { - if (page[i] == NULL || page[i] == target_page) - continue; - flush_dcache_page(page[i]); - SetPageError(page[i]); - unlock_page(page[i]); - page_cache_release(page[i]); - } - -out: - squashfs_page_actor_free(actor, 0); - kfree(page); - return res; -} - - -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page) -{ - struct inode *i = target_page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int bytes = buffer->length, res = buffer->error, n, offset = 0; - void *pageaddr; - - if (res) { - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - goto out; - } - - for (n = 0; n < pages && bytes > 0; n++, - bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { - int avail = min_t(int, bytes, PAGE_CACHE_SIZE); - - if (page[n] == NULL) - continue; - - pageaddr = kmap_atomic(page[n]); - squashfs_copy_data(pageaddr, buffer, offset, avail); - memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); - kunmap_atomic(pageaddr); - flush_dcache_page(page[n]); - SetPageUptodate(page[n]); - unlock_page(page[n]); - if (page[n] != target_page) - page_cache_release(page[n]); - } - -out: - squashfs_cache_put(buffer); - return res; + get_page(target_page); + res = squashfs_read_data_async(inode->i_sb, block, bsize, NULL, + actor); + return res < 0 ? res : 0; } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 887d6d270080..985317fbf6c2 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -28,8 +28,14 @@ #define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) /* block.c */ +extern int squashfs_init_read_wq(void); +extern void squashfs_destroy_read_wq(void); extern int squashfs_read_data(struct super_block *, u64, int, u64 *, struct squashfs_page_actor *); +extern int squashfs_read_data(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); +extern int squashfs_read_data_async(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 5056babe00df..61cd0b39ed0e 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -444,9 +444,15 @@ static int __init init_squashfs_fs(void) if (err) return err; + if (!squashfs_init_read_wq()) { + destroy_inodecache(); + return -ENOMEM; + } + err = register_filesystem(&squashfs_fs_type); if (err) { destroy_inodecache(); + squashfs_destroy_read_wq(); return err; } @@ -460,6 +466,7 @@ static void __exit exit_squashfs_fs(void) { unregister_filesystem(&squashfs_fs_type); destroy_inodecache(); + squashfs_destroy_read_wq(); } From 5e9c466d6ec02a6d43ab7ff595c90b10bd0eada1 Mon Sep 17 00:00:00 2001 From: Adrien Schildknecht Date: Mon, 7 Nov 2016 12:41:42 -0800 Subject: [PATCH 0653/3220] Squashfs: implement .readpages() Squashfs does not implement .readpages(), so the kernel just repeatedly calls .readpage(). The readpages function tries to pack as much pages as possible in the same page actor so that only 1 read request is issued. Now that the read requests are asynchronous, the kernel can truly prefetch pages using its readahead algorithm. 
Signed-off-by: Adrien Schildknecht Change-Id: Ice70e029dc24526f61e4e5a1a902588be2212498 --- fs/squashfs/file.c | 138 ++++++++++++++++++++++++++++---------- fs/squashfs/file_direct.c | 64 ++++++++++++------ fs/squashfs/squashfs.h | 5 +- 3 files changed, 151 insertions(+), 56 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e5c9689062ba..6f5ef8d7e55a 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -47,12 +47,16 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +// Backported from 4.5 +#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) + /* * Locate cache slot in range [offset, index] for specified inode. If * there's more than one return the slot closest to index. @@ -438,6 +442,21 @@ static int squashfs_readpage_fragment(struct page *page) return res; } +static int squashfs_readpages_fragment(struct page *page, + struct list_head *readahead_pages, struct address_space *mapping) +{ + if (!page) { + page = lru_to_page(readahead_pages); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + mapping_gfp_constraint(mapping, GFP_KERNEL))) { + put_page(page); + return 0; + } + } + return squashfs_readpage_fragment(page); +} + static int squashfs_readpage_sparse(struct page *page, int index, int file_end) { struct inode *inode = page->mapping->host; @@ -450,54 +469,105 @@ static int squashfs_readpage_sparse(struct page *page, int index, int file_end) return 0; } -static int squashfs_readpage(struct file *file, struct page *page) +static int squashfs_readpages_sparse(struct page *page, + struct list_head *readahead_pages, int index, int file_end, + struct address_space *mapping) { - struct inode *inode = page->mapping->host; + if (!page) { + page = lru_to_page(readahead_pages); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + mapping_gfp_constraint(mapping, GFP_KERNEL))) { + put_page(page); + return 0; + } + } + return squashfs_readpage_sparse(page, index, file_end); +} + +static int __squashfs_readpages(struct file *file, struct page *page, + struct list_head *readahead_pages, unsigned int nr_pages, + struct address_space *mapping) +{ + struct inode *inode = mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); int file_end = i_size_read(inode) >> msblk->block_log; int res; - void *pageaddr; + + do { + struct page *cur_page = page ? 
page + : lru_to_page(readahead_pages); + int page_index = cur_page->index; + int index = page_index >> (msblk->block_log - PAGE_CACHE_SHIFT); + + if (page_index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + return 1; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + + if (bsize < 0) + return -1; + + if (bsize == 0) { + res = squashfs_readpages_sparse(page, + readahead_pages, index, file_end, + mapping); + } else { + res = squashfs_readpages_block(page, + readahead_pages, &nr_pages, mapping, + page_index, block, bsize); + } + } else { + res = squashfs_readpages_fragment(page, + readahead_pages, mapping); + } + if (res) + return 0; + page = NULL; + } while (readahead_pages && !list_empty(readahead_pages)); + + return 0; +} + +static int squashfs_readpage(struct file *file, struct page *page) +{ + int ret; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(inode)->start); + page->index, squashfs_i(page->mapping->host)->start); - if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) - goto out; + get_page(page); - if (index < file_end || squashfs_i(inode)->fragment_block == - SQUASHFS_INVALID_BLK) { - u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) - goto error_out; - - if (bsize == 0) - res = squashfs_readpage_sparse(page, index, file_end); + ret = __squashfs_readpages(file, page, NULL, 1, page->mapping); + if (ret) { + flush_dcache_page(page); + if (ret < 0) + SetPageError(page); else - res = squashfs_readpage_block(page, block, bsize); - } else - res = squashfs_readpage_fragment(page); + SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + put_page(page); + } - if (!res) - return 0; - -error_out: - SetPageError(page); -out: - pageaddr = kmap_atomic(page); - memset(pageaddr, 0, PAGE_CACHE_SIZE); - kunmap_atomic(pageaddr); - flush_dcache_page(page); - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); + return 0; +} +static int squashfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) +{ + TRACE("Entered squashfs_readpages, %u pages, first page index %lx\n", + nr_pages, lru_to_page(pages)->index); + __squashfs_readpages(file, NULL, pages, nr_pages, mapping); return 0; } const struct address_space_operations squashfs_aops = { - .readpage = squashfs_readpage + .readpage = squashfs_readpage, + .readpages = squashfs_readpages, }; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 10fe1272535f..3fb4ce210edb 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -20,6 +21,9 @@ #include "squashfs.h" #include "page_actor.h" +// Backported from 4.5 +#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) + static void release_actor_pages(struct page **page, int pages, int error) { int i; @@ -45,23 +49,40 @@ static void release_actor_pages(struct page **page, int pages, int error) * page cache pages appropriately within the decompressor */ static struct squashfs_page_actor *actor_from_page_cache( - struct page *target_page, int start_index, int nr_pages) + unsigned int actor_pages, struct page *target_page, + struct list_head *rpages, unsigned int *nr_pages, int start_index, + 
struct address_space *mapping) { - int i, n; struct page **page; struct squashfs_page_actor *actor; + int i, n; + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - page = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + page = kmalloc_array(actor_pages, sizeof(void *), GFP_KERNEL); if (!page) return NULL; - /* Try to grab all the pages covered by the SquashFS block */ - for (i = 0, n = start_index; i < nr_pages; i++, n++) { - if (target_page->index == n) { + for (i = 0, n = start_index; i < actor_pages; i++, n++) { + if (target_page == NULL && rpages && !list_empty(rpages)) { + struct page *cur_page = lru_to_page(rpages); + + if (cur_page->index < start_index + actor_pages) { + list_del(&cur_page->lru); + --(*nr_pages); + if (add_to_page_cache_lru(cur_page, mapping, + cur_page->index, gfp)) + put_page(cur_page); + else + target_page = cur_page; + } else + rpages = NULL; + } + + if (target_page && target_page->index == n) { page[i] = target_page; + target_page = NULL; } else { - page[i] = grab_cache_page_nowait(target_page->mapping, - n); + page[i] = grab_cache_page_nowait(mapping, n); if (page[i] == NULL) continue; } @@ -73,39 +94,42 @@ static struct squashfs_page_actor *actor_from_page_cache( } } - actor = squashfs_page_actor_init(page, nr_pages, 0, + actor = squashfs_page_actor_init(page, actor_pages, 0, release_actor_pages); if (!actor) { - release_actor_pages(page, nr_pages, -ENOMEM); + release_actor_pages(page, actor_pages, -ENOMEM); kfree(page); return NULL; } return actor; } -/* Read separately compressed datablock directly into page cache */ -int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) +int squashfs_readpages_block(struct page *target_page, + struct list_head *readahead_pages, + unsigned int *nr_pages, + struct address_space *mapping, + int page_index, u64 block, int bsize) { - struct inode *inode = target_page->mapping->host; + struct squashfs_page_actor *actor; + struct inode *inode = mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; - int start_index = target_page->index & ~mask; + int start_index = page_index & ~mask; int end_index = start_index | mask; - int pages, res = -ENOMEM; - struct squashfs_page_actor *actor; + int actor_pages, res; if (end_index > file_end) end_index = file_end; - pages = end_index - start_index + 1; + actor_pages = end_index - start_index + 1; - actor = actor_from_page_cache(target_page, start_index, pages); + actor = actor_from_page_cache(actor_pages, target_page, + readahead_pages, nr_pages, start_index, + mapping); if (!actor) return -ENOMEM; - get_page(target_page); res = squashfs_read_data_async(inode->i_sb, block, bsize, NULL, actor); return res < 0 ? 
res : 0;

diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 985317fbf6c2..6093579c6c5d 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -76,8 +76,9 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
 void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
 				int);
 
-/* file_xxx.c */
-extern int squashfs_readpage_block(struct page *, u64, int);
+/* file_direct.c */
+extern int squashfs_readpages_block(struct page *, struct list_head *,
+		unsigned int *, struct address_space *, int, u64, int);
 
 /* id.c */
 extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);

From d9aa8ddc51cbd7d3bc68d664f9b71548a04447e9 Mon Sep 17 00:00:00 2001
From: Adrien Schildknecht
Date: Thu, 29 Sep 2016 15:25:30 -0700
Subject: [PATCH 0654/3220] Squashfs: optimize reading uncompressed data

When dealing with uncompressed data, there is no need to read a whole
block (default 128K) to get the desired page: the pages are independent
of each other.

This patch changes the readpages logic so that reading uncompressed data
only reads the number of pages advised by the readahead algorithm.

Moreover, if the page actor contains holes (i.e. pages that are already
up-to-date), squashfs skips the buffer_heads associated with those pages.

This patch greatly improves the performance of random reads for
uncompressed files because squashfs only reads what is needed. It also
reduces the number of unnecessary reads.

Signed-off-by: Adrien Schildknecht
Change-Id: I1850150fbf4b45c9dd138d88409fea1ab44054c0
---
 fs/squashfs/block.c | 25 +++++++++++++++++++++
 fs/squashfs/file_direct.c | 37 ++++++++++++++++++++++++++++-------
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8a75d812ffdf..2eb66decc5ab 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -206,6 +206,22 @@ static void squashfs_bio_end_io(struct bio *bio)
 	kfree(bio_req);
 }
 
+static int bh_is_optional(struct squashfs_read_request *req, int idx)
+{
+	int start_idx, end_idx;
+	struct squashfs_sb_info *msblk = req->sb->s_fs_info;
+
+	start_idx = (idx * msblk->devblksize - req->offset) / PAGE_CACHE_SIZE;
+	end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) / PAGE_CACHE_SIZE;
+	if (start_idx >= req->output->pages)
+		return 1;
+	if (start_idx < 0)
+		start_idx = end_idx;
+	if (end_idx >= req->output->pages)
+		end_idx = start_idx;
+	return !req->output->page[start_idx] && !req->output->page[end_idx];
+}
+
 static int actor_getblks(struct squashfs_read_request *req, u64 block)
 {
 	int i;
@@ -215,6 +231,15 @@ static int actor_getblks(struct squashfs_read_request *req, u64 block)
 		return -ENOMEM;
 
 	for (i = 0; i < req->nr_buffers; ++i) {
+		/*
+		 * When dealing with an uncompressed block, the actor may
+		 * contain NULL pages. There's no need to read the buffers
+		 * associated with these pages.
+		 */
+		if (!req->compressed && bh_is_optional(req, i)) {
+			req->bh[i] = NULL;
+			continue;
+		}
 		req->bh[i] = sb_getblk(req->sb, block + i);
 		if (!req->bh[i]) {
 			while (--i) {
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 3fb4ce210edb..c97af4c6ccd0 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -114,15 +114,38 @@ int squashfs_readpages_block(struct page *target_page,
 	struct squashfs_page_actor *actor;
 	struct inode *inode = mapping->host;
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-	int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	int start_index, end_index, file_end, actor_pages, res;
 	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
-	int start_index = page_index & ~mask;
-	int end_index = start_index | mask;
-	int actor_pages, res;
 
-	if (end_index > file_end)
-		end_index = file_end;
-	actor_pages = end_index - start_index + 1;
+	/*
+	 * If readpage() is called on an uncompressed datablock, we can just
+	 * read the pages instead of fetching the whole block.
+	 * This greatly improves the performance when a process keeps doing
+	 * random reads because we only fetch the necessary data.
+	 * The readahead algorithm will take care of doing speculative reads
+	 * if necessary.
+	 * We can't read more than 1 block even if readahead provides us more
+	 * pages because we don't know yet if the next block is compressed or
+	 * not.
+	 */
+	if (bsize && !SQUASHFS_COMPRESSED_BLOCK(bsize)) {
+		u64 block_end = block + msblk->block_size;
+
+		block += (page_index & mask) * PAGE_CACHE_SIZE;
+		actor_pages = (block_end - block) / PAGE_CACHE_SIZE;
+		if (*nr_pages < actor_pages)
+			actor_pages = *nr_pages;
+		start_index = page_index;
+		bsize = min_t(int, bsize, (PAGE_CACHE_SIZE * actor_pages)
+					  | SQUASHFS_COMPRESSED_BIT_BLOCK);
+	} else {
+		file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+		start_index = page_index & ~mask;
+		end_index = start_index | mask;
+		if (end_index > file_end)
+			end_index = file_end;
+		actor_pages = end_index - start_index + 1;
+	}
 
 	actor = actor_from_page_cache(actor_pages, target_page,
 				      readahead_pages, nr_pages, start_index,

From 11fac3aed57f1ef25755fbdb167c31cc322e2b09 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 9 Feb 2017 19:38:57 -0800
Subject: [PATCH 0655/3220] ANDROID: export security_path_chown

BUG: 35142419
Change-Id: I05a9430a3c1bc624e019055175ad377290b4e774
Signed-off-by: Daniel Rosenberg
---
 security/security.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/security/security.c b/security/security.c
index 46f405ce6b0f..ae05ab153c5a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -498,6 +498,7 @@ int security_path_chown(struct path *path, kuid_t uid, kgid_t gid)
 		return 0;
 	return call_int_hook(path_chown, 0, path, uid, gid);
 }
+EXPORT_SYMBOL(security_path_chown);
 
 int security_path_chroot(struct path *path)
 {

From d854b688907b34fcab97fc3b58000084255ee53a Mon Sep 17 00:00:00 2001
From: Mohan Srinivasan
Date: Fri, 3 Feb 2017 15:48:03 -0800
Subject: [PATCH 0656/3220] ANDROID: Refactor fs readpage/write tracepoints.

Refactor the fs readpage/write tracepoints to move the inode->path
lookup outside the tracepoint code, and pass a pointer to the path
into the tracepoint code instead. This is necessary because the
tracepoint code runs in non-preemptible context.

Thanks to Trilok Soni for catching this in 4.4.
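The shape of the change, shown once here for the read side (the hunks
below apply the same transformation at every call site):

	if (trace_android_fs_dataread_start_enabled()) {
		char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];

		/* Resolve the pathname here, in preemptible context,
		 * and hand the tracepoint a ready-made string.
		 */
		path = android_fstrace_get_pathname(pathbuf,
						    MAX_TRACE_PATHBUF_LEN,
						    inode);
		trace_android_fs_dataread_start(inode, offset, count,
						current->pid, path,
						current->comm);
	}

The trace_*_enabled() guard also means the dentry lookup is only paid
for when the tracepoint is actually active.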
Change-Id: I7486c5947918d155a30c61d6b9cd5027cf8fbe15 Signed-off-by: Mohan Srinivasan --- fs/ext4/inline.c | 12 ++++- fs/ext4/inode.c | 53 ++++++++++++++++------ fs/ext4/readpage.c | 6 +++ fs/f2fs/data.c | 40 ++++++++++++---- fs/f2fs/inline.c | 13 ++++-- fs/mpage.c | 6 +++ include/trace/events/android_fs.h | 44 ++++++++++++++++-- include/trace/events/android_fs_template.h | 34 +++----------- 8 files changed, 147 insertions(+), 61 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index af34979684a4..3c0ee824f512 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -501,8 +501,16 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) return -EAGAIN; } - trace_android_fs_dataread_start(inode, page_offset(page), PAGE_SIZE, - current->pid, current->comm); + if (trace_android_fs_dataread_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_dataread_start(inode, page_offset(page), + PAGE_SIZE, current->pid, + path, current->comm); + } /* * Current inline data can only exist in the 1st page, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a7208a662ee6..07037262337a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -983,8 +983,16 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, current->comm); + if (trace_android_fs_datawrite_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, path, + current->comm); + } trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case @@ -2675,8 +2683,16 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, current->comm); + if (trace_android_fs_datawrite_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, + path, current->comm); + } trace_ext4_da_write_begin(inode, pos, len, flags); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -3285,16 +3301,27 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; if (trace_android_fs_dataread_start_enabled() && - (iov_iter_rw(iter) == READ)) - trace_android_fs_dataread_start(inode, offset, count, - current->pid, - current->comm); - if (trace_android_fs_datawrite_start_enabled() && - (iov_iter_rw(iter) == WRITE)) - trace_android_fs_datawrite_start(inode, offset, count, - current->pid, - current->comm); + (iov_iter_rw(iter) == READ)) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_dataread_start(inode, offset, count, + current->pid, path, + current->comm); + } + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, path, + current->comm); + } 
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(iocb, iter, offset); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 1ce24a6759a0..1c5db9fd9c8f 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -152,11 +152,17 @@ ext4_submit_bio_read(struct bio *bio) struct page *first_page = bio->bi_io_vec[0].bv_page; if (first_page != NULL) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + first_page->mapping->host); trace_android_fs_dataread_start( first_page->mapping->host, page_offset(first_page), bio->bi_iter.bi_size, current->pid, + path, current->comm); } } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e692958d6e78..8936044dee4c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1402,8 +1402,16 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct dnode_of_data dn; int err = 0; - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, current->comm); + if (trace_android_fs_datawrite_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, pos, len, + current->pid, path, + current->comm); + } trace_f2fs_write_begin(inode, pos, len, flags); f2fs_balance_fs(sbi); @@ -1587,15 +1595,27 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (trace_android_fs_dataread_start_enabled() && - (iov_iter_rw(iter) == READ)) - trace_android_fs_dataread_start(inode, offset, - count, current->pid, - current->comm); - if (trace_android_fs_datawrite_start_enabled() && - (iov_iter_rw(iter) == WRITE)) - trace_android_fs_datawrite_start(inode, offset, count, - current->pid, current->comm); + (iov_iter_rw(iter) == READ)) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_dataread_start(inode, offset, + count, current->pid, path, + current->comm); + } + if (trace_android_fs_datawrite_start_enabled() && + (iov_iter_rw(iter) == WRITE)) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_datawrite_start(inode, offset, count, + current->pid, path, + current->comm); + } if (iov_iter_rw(iter) == WRITE) { __allocate_data_blocks(inode, offset, count); if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d2c5d69ba0b1..dbb2cc4df603 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -85,9 +85,16 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; - trace_android_fs_dataread_start(inode, page_offset(page), - PAGE_SIZE, current->pid, - current->comm); + if (trace_android_fs_dataread_start_enabled()) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + inode); + trace_android_fs_dataread_start(inode, page_offset(page), + PAGE_SIZE, current->pid, + path, current->comm); + } ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { diff --git a/fs/mpage.c b/fs/mpage.c index 5c65d8942692..0fd48fdcc1b1 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -79,11 +79,17 @@ static struct bio *mpage_bio_submit(int 
rw, struct bio *bio) struct page *first_page = bio->bi_io_vec[0].bv_page; if (first_page != NULL) { + char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; + + path = android_fstrace_get_pathname(pathbuf, + MAX_TRACE_PATHBUF_LEN, + first_page->mapping->host); trace_android_fs_dataread_start( first_page->mapping->host, page_offset(first_page), bio->bi_iter.bi_size, current->pid, + path, current->comm); } } diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h index 531da433a7bc..49509533d3fa 100644 --- a/include/trace/events/android_fs.h +++ b/include/trace/events/android_fs.h @@ -9,8 +9,8 @@ DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *command), - TP_ARGS(inode, offset, bytes, pid, command)); + pid_t pid, char *pathname, char *command), + TP_ARGS(inode, offset, bytes, pid, pathname, command)); DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), @@ -18,14 +18,48 @@ DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end, DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *command), - TP_ARGS(inode, offset, bytes, pid, command)); + pid_t pid, char *pathname, char *command), + TP_ARGS(inode, offset, bytes, pid, pathname, command)); DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), - TP_ARGS(inode, offset, bytes)); + TP_ARGS(inode, offset, bytes)); #endif /* _TRACE_ANDROID_FS_H */ /* This part must be outside protection */ #include + +#ifndef ANDROID_FSTRACE_GET_PATHNAME +#define ANDROID_FSTRACE_GET_PATHNAME + +/* Sizes an on-stack array, so careful if sizing this up ! */ +#define MAX_TRACE_PATHBUF_LEN 256 + +static inline char * +android_fstrace_get_pathname(char *buf, int buflen, struct inode *inode) +{ + char *path; + struct dentry *d; + + /* + * d_obtain_alias() will either iput() if it locates an existing + * dentry or transfer the reference to the new dentry created. + * So get an extra reference here. + */ + ihold(inode); + d = d_obtain_alias(inode); + if (likely(!IS_ERR(d))) { + path = dentry_path_raw(d, buf, buflen); + if (unlikely(IS_ERR(path))) { + strcpy(buf, "ERROR"); + path = buf; + } + dput(d); + } else { + strcpy(buf, "ERROR"); + path = buf; + } + return path; +} +#endif diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h index 618988b047c1..4e61ffe7a814 100644 --- a/include/trace/events/android_fs_template.h +++ b/include/trace/events/android_fs_template.h @@ -5,11 +5,10 @@ DECLARE_EVENT_CLASS(android_fs_data_start_template, TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *command), - TP_ARGS(inode, offset, bytes, pid, command), + pid_t pid, char *pathname, char *command), + TP_ARGS(inode, offset, bytes, pid, pathname, command), TP_STRUCT__entry( - __array(char, path, MAX_FILTER_STR_VAL); - __field(char *, pathname); + __string(pathbuf, pathname); __field(loff_t, offset); __field(int, bytes); __field(loff_t, i_size); @@ -19,27 +18,7 @@ DECLARE_EVENT_CLASS(android_fs_data_start_template, ), TP_fast_assign( { - struct dentry *d; - - /* - * Grab a reference to the inode here because - * d_obtain_alias() will either drop the inode - * reference if it locates an existing dentry - * or transfer the reference to the new dentry - * created. 
In our case, the file is still open, - * so the dentry is guaranteed to exist (connected), - * so d_obtain_alias() drops the reference we - * grabbed here. - */ - ihold(inode); - d = d_obtain_alias(inode); - if (!IS_ERR(d)) { - __entry->pathname = dentry_path(d, - __entry->path, - MAX_FILTER_STR_VAL); - dput(d); - } else - __entry->pathname = ERR_PTR(-EINVAL); + __assign_str(pathbuf, pathname); __entry->offset = offset; __entry->bytes = bytes; __entry->i_size = i_size_read(inode); @@ -50,9 +29,8 @@ DECLARE_EVENT_CLASS(android_fs_data_start_template, ), TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," " pid %d, i_size %llu, ino %lu", - (IS_ERR(__entry->pathname) ? "ERROR" : __entry->pathname), - __entry->offset, __entry->bytes, __get_str(cmdline), - __entry->pid, __entry->i_size, + __get_str(pathbuf), __entry->offset, __entry->bytes, + __get_str(cmdline), __entry->pid, __entry->i_size, (unsigned long) __entry->ino) ); From 15227d3ccce3bcac1bd797a7428f1e3305bd9d6c Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Fri, 12 Feb 2016 09:47:52 -0600 Subject: [PATCH 0657/3220] UPSTREAM: arm/arm64: crypto: assure that ECB modes don't require an IV ECB modes don't use an initialization vector. The kernel /proc/crypto interface doesn't reflect this properly. Acked-by: Ard Biesheuvel Signed-off-by: Jeremy Linton Signed-off-by: Will Deacon (cherry picked from bee038a4bd2efe8188cc80dfdad706a9abe568ad) Signed-off-by: Eric Biggers Change-Id: Ief9558d2b41be58a2d845d2033a141b5ef7b585f --- arch/arm/crypto/aes-ce-glue.c | 4 ++-- arch/arm64/crypto/aes-glue.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index b445a5d56f43..89a3a3e592d6 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -364,7 +364,7 @@ static struct crypto_alg aes_algs[] = { { .cra_blkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, + .ivsize = 0, .setkey = ce_aes_setkey, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, @@ -441,7 +441,7 @@ static struct crypto_alg aes_algs[] = { { .cra_ablkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, + .ivsize = 0, .setkey = ablk_set_key, .encrypt = ablk_encrypt, .decrypt = ablk_decrypt, diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 05d9e16c0dfd..7a3d22a46faf 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -294,7 +294,7 @@ static struct crypto_alg aes_algs[] = { { .cra_blkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, + .ivsize = 0, .setkey = aes_setkey, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, @@ -371,7 +371,7 @@ static struct crypto_alg aes_algs[] = { { .cra_ablkcipher = { .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, + .ivsize = 0, .setkey = ablk_set_key, .encrypt = ablk_encrypt, .decrypt = ablk_decrypt, From 93867d9bc5c10fca7d6e75b57a8a4b8171a8dfe8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 16:47:49 -0800 Subject: [PATCH 0658/3220] ANDROID: crypto: allow blkcipher walks over ablkcipher data Add a function blkcipher_ablkcipher_walk_virt() which allows ablkcipher algorithms to use the blkcipher_walk API to walk over their data. 
This will be used by the HEH algorithm, which, in order to support
asynchronous ECB algorithms, will be an ablkcipher, but which also needs
to make other passes over the data.

Bug: 32975945
Signed-off-by: Eric Biggers
Change-Id: I05f9a0e5473ba6115fcc72d5122d6b0b18b2078b
---
 crypto/blkcipher.c | 21 +++++++++++++++++++++
 include/crypto/algapi.h | 3 +++
 2 files changed, 24 insertions(+)

diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index 8cc1622b2ee0..dae7194bceb9 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -372,6 +372,27 @@ int blkcipher_aead_walk_virt_block(struct blkcipher_desc *desc,
 }
 EXPORT_SYMBOL_GPL(blkcipher_aead_walk_virt_block);
 
+/*
+ * This function allows ablkcipher algorithms to use the blkcipher_walk API to
+ * walk over their data.  The specified crypto_ablkcipher tfm is used to
+ * initialize the struct blkcipher_walk, and the crypto_blkcipher specified in
+ * desc->tfm is never used, so it can be left NULL.  (Yes, this design is ugly,
+ * but it parallels blkcipher_aead_walk_virt_block() above.  In the 4.10 kernel
+ * this is starting to be cleaned up...)
+ */
+int blkcipher_ablkcipher_walk_virt(struct blkcipher_desc *desc,
+				   struct blkcipher_walk *walk,
+				   struct crypto_ablkcipher *tfm)
+{
+	walk->flags &= ~BLKCIPHER_WALK_PHYS;
+	walk->walk_blocksize = crypto_ablkcipher_blocksize(tfm);
+	walk->cipher_blocksize = walk->walk_blocksize;
+	walk->ivsize = crypto_ablkcipher_ivsize(tfm);
+	walk->alignmask = crypto_ablkcipher_alignmask(tfm);
+	return blkcipher_walk_first(desc, walk);
+}
+EXPORT_SYMBOL_GPL(blkcipher_ablkcipher_walk_virt);
+
 static int setkey_unaligned(struct crypto_tfm *tfm, const u8 *key,
 			    unsigned int keylen)
 {

diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index c9fe145f7dd3..04661e1fb625 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -202,6 +202,9 @@ int blkcipher_aead_walk_virt_block(struct blkcipher_desc *desc,
 				   struct blkcipher_walk *walk,
 				   struct crypto_aead *tfm,
 				   unsigned int blocksize);
+int blkcipher_ablkcipher_walk_virt(struct blkcipher_desc *desc,
+				   struct blkcipher_walk *walk,
+				   struct crypto_ablkcipher *tfm);
 int ablkcipher_walk_done(struct ablkcipher_request *req,
 			 struct ablkcipher_walk *walk, int err);

From c8bb10b1eec8156fe3efd1543b6431c32434ba28 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Tue, 10 Jan 2017 16:47:49 -0800
Subject: [PATCH 0659/3220] ANDROID: crypto: shash - Add crypto_grab_shash()
 and crypto_spawn_shash_alg()

Analogous to crypto_grab_skcipher() and crypto_spawn_skcipher_alg(),
these are useful for algorithms that need to use a shash sub-algorithm,
possibly in addition to other sub-algorithms.
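For illustration, a template's ->create() hook might use the new helpers
roughly as follows (sketch only; 'ictx' and the choice of "cmac(aes)" are
hypothetical, and instance allocation/cleanup is omitted):

	struct crypto_shash_spawn *spawn = &ictx->cmac;
	int err;

	err = crypto_grab_shash(spawn, "cmac(aes)", 0, 0);
	if (err)
		return err;

	/* The sub-algorithm can now be inspected before use, e.g.: */
	if (crypto_spawn_shash_alg(spawn)->digestsize != 16)
		return -EINVAL;

This mirrors the existing crypto_grab_skcipher() /
crypto_spawn_skcipher_alg() usage pattern for skcipher sub-algorithms.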
Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I44e5a519d73f5f839e3b6ecbf8c66e36ec569557 --- crypto/shash.c | 8 ++++++++ include/crypto/internal/hash.h | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/crypto/shash.c b/crypto/shash.c index ecb1e3d39bf0..c10d16373b87 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -682,6 +682,14 @@ void shash_free_instance(struct crypto_instance *inst) } EXPORT_SYMBOL_GPL(shash_free_instance); +int crypto_grab_shash(struct crypto_shash_spawn *spawn, + const char *name, u32 type, u32 mask) +{ + spawn->base.frontend = &crypto_shash_type; + return crypto_grab_spawn(&spawn->base, name, type, mask); +} +EXPORT_SYMBOL_GPL(crypto_grab_shash); + int crypto_init_shash_spawn(struct crypto_shash_spawn *spawn, struct shash_alg *alg, struct crypto_instance *inst) diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 3b4af1d7c7e9..476d99d0edb7 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -102,6 +102,8 @@ int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst); void shash_free_instance(struct crypto_instance *inst); +int crypto_grab_shash(struct crypto_shash_spawn *spawn, + const char *name, u32 type, u32 mask); int crypto_init_shash_spawn(struct crypto_shash_spawn *spawn, struct shash_alg *alg, struct crypto_instance *inst); @@ -111,6 +113,12 @@ static inline void crypto_drop_shash(struct crypto_shash_spawn *spawn) crypto_drop_spawn(&spawn->base); } +static inline struct shash_alg *crypto_spawn_shash_alg( + struct crypto_shash_spawn *spawn) +{ + return container_of(spawn->base.alg, struct shash_alg, base); +} + struct shash_alg *shash_attr_alg(struct rtattr *rta, u32 type, u32 mask); int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc); From 8ea7531e4764805981be72a2b1eed3617af24a38 Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Mon, 14 Nov 2016 11:02:54 -0800 Subject: [PATCH 0660/3220] UPSTREAM: crypto: gf128mul - Zero memory when freeing multiplication table GF(2^128) multiplication tables are typically used for secret information, so it's a good idea to zero them on free. Signed-off-by: Alex Cope Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu (cherry-picked from 75aa0a7cafe951538c7cb7c5ed457a3371ec5bcd) Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I37b1ae9544158007f9ee2caf070120f4a42153ab --- crypto/gf128mul.c | 4 ++-- include/crypto/gf128mul.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 5276607c72d0..0594dd6f82f2 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -352,8 +352,8 @@ void gf128mul_free_64k(struct gf128mul_64k *t) int i; for (i = 0; i < 16; i++) - kfree(t->t[i]); - kfree(t); + kzfree(t->t[i]); + kzfree(t); } EXPORT_SYMBOL(gf128mul_free_64k); diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index da2530e34b26..7217fe6dbe33 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h @@ -177,7 +177,7 @@ void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { - kfree(t); + kzfree(t); } From 3eaf06b785a603e0258af10ef1f98d8a511874a8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 17:04:54 -0800 Subject: [PATCH 0661/3220] ANDROID: crypto: gf128mul - Refactor gf128 overflow macros and tables Rename and clean up the GF(2^128) overflow macros and tables. Their usage is more general than the name suggested, e.g. 
what was previously known as the "bbe" table can actually be used for both "bbe" and "ble" multiplication. Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: Ie6c47b4075ca40031eb1767e9b468cfd7bf1b2e4 --- crypto/gf128mul.c | 68 ++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 0594dd6f82f2..8b65b1eb5dda 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -88,33 +88,47 @@ q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ } -/* Given the value i in 0..255 as the byte overflow when a field element - in GHASH is multiplied by x^8, this function will return the values that - are generated in the lo 16-bit word of the field value by applying the - modular polynomial. The values lo_byte and hi_byte are returned via the - macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into - memory as required by a suitable definition of this macro operating on - the table above -*/ +/* + * Given a value i in 0..255 as the byte overflow when a field element + * in GF(2^128) is multiplied by x^8, the following macro returns the + * 16-bit value that must be XOR-ed into the low-degree end of the + * product to reduce it modulo the irreducible polynomial x^128 + x^7 + + * x^2 + x + 1. + * + * There are two versions of the macro, and hence two tables: one for + * the "be" convention where the highest-order bit is the coefficient of + * the highest-degree polynomial term, and one for the "le" convention + * where the highest-order bit is the coefficient of the lowest-degree + * polynomial term. In both cases the values are stored in CPU byte + * endianness such that the coefficients are ordered consistently across + * bytes, i.e. in the "be" table bits 15..0 of the stored value + * correspond to the coefficients of x^15..x^0, and in the "le" table + * bits 15..0 correspond to the coefficients of x^0..x^15. + * + * Therefore, provided that the appropriate byte endianness conversions + * are done by the multiplication functions (and these must be in place + * anyway to support both little endian and big endian CPUs), the "be" + * table can be used for multiplications of both "bbe" and "ble" + * elements, and the "le" table can be used for multiplications of both + * "lle" and "lbe" elements. + */ -#define xx(p, q) 0x##p##q - -#define xda_bbe(i) ( \ - (i & 0x80 ? xx(43, 80) : 0) ^ (i & 0x40 ? xx(21, c0) : 0) ^ \ - (i & 0x20 ? xx(10, e0) : 0) ^ (i & 0x10 ? xx(08, 70) : 0) ^ \ - (i & 0x08 ? xx(04, 38) : 0) ^ (i & 0x04 ? xx(02, 1c) : 0) ^ \ - (i & 0x02 ? xx(01, 0e) : 0) ^ (i & 0x01 ? xx(00, 87) : 0) \ +#define xda_be(i) ( \ + (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \ + (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \ + (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \ + (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \ ) -#define xda_lle(i) ( \ - (i & 0x80 ? xx(e1, 00) : 0) ^ (i & 0x40 ? xx(70, 80) : 0) ^ \ - (i & 0x20 ? xx(38, 40) : 0) ^ (i & 0x10 ? xx(1c, 20) : 0) ^ \ - (i & 0x08 ? xx(0e, 10) : 0) ^ (i & 0x04 ? xx(07, 08) : 0) ^ \ - (i & 0x02 ? xx(03, 84) : 0) ^ (i & 0x01 ? xx(01, c2) : 0) \ +#define xda_le(i) ( \ + (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \ + (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \ + (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \ + (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 
0x01c2 : 0) \ ) -static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle); -static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe); +static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); +static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); /* These functions multiply a field element by x, by x^4 and by x^8 * in the polynomial field representation. It uses 32-bit word operations @@ -126,7 +140,7 @@ static void gf128mul_x_lle(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_lle[(b << 7) & 0xff]; + u64 _tt = gf128mul_table_le[(b << 7) & 0xff]; r->b = cpu_to_be64((b >> 1) | (a << 63)); r->a = cpu_to_be64((a >> 1) ^ (_tt << 48)); @@ -136,7 +150,7 @@ static void gf128mul_x_bbe(be128 *r, const be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_bbe[a >> 63]; + u64 _tt = gf128mul_table_be[a >> 63]; r->a = cpu_to_be64((a << 1) | (b >> 63)); r->b = cpu_to_be64((b << 1) ^ _tt); @@ -146,7 +160,7 @@ void gf128mul_x_ble(be128 *r, const be128 *x) { u64 a = le64_to_cpu(x->a); u64 b = le64_to_cpu(x->b); - u64 _tt = gf128mul_table_bbe[b >> 63]; + u64 _tt = gf128mul_table_be[b >> 63]; r->a = cpu_to_le64((a << 1) ^ _tt); r->b = cpu_to_le64((b << 1) | (a >> 63)); @@ -157,7 +171,7 @@ static void gf128mul_x8_lle(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_lle[b & 0xff]; + u64 _tt = gf128mul_table_le[b & 0xff]; x->b = cpu_to_be64((b >> 8) | (a << 56)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); @@ -167,7 +181,7 @@ static void gf128mul_x8_bbe(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); - u64 _tt = gf128mul_table_bbe[a >> 56]; + u64 _tt = gf128mul_table_be[a >> 56]; x->a = cpu_to_be64((a << 8) | (b >> 56)); x->b = cpu_to_be64((b << 8) ^ _tt); From ce2ace45d931f46e79e8e3c2e857c083e67be554 Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Tue, 10 Jan 2017 16:47:49 -0800 Subject: [PATCH 0662/3220] ANDROID: crypto: gf128mul - Add ble multiplication functions Adding ble multiplication to GF128mul, and fixing up comments. The ble multiplication functions multiply GF(2^128) elements in the ble format. This format is preferable because the bits within each byte map to polynomial coefficients in the natural order (lowest order bit = coefficient of lowest degree polynomial term), and the bytes are stored in little endian order which matches the endianness of most modern CPUs. These new functions will be used by the HEH algorithm. Signed-off-by: Alex Cope Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I39a58e8ee83e6f9b2e6bd51738f816dbfa2f3a47 --- crypto/gf128mul.c | 99 ++++++++++++++++++++++++++++++++++++--- include/crypto/gf128mul.h | 45 +++++++++--------- 2 files changed, 117 insertions(+), 27 deletions(-) diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 8b65b1eb5dda..f3d9f6da0767 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c @@ -44,7 +44,7 @@ --------------------------------------------------------------------------- Issue 31/01/2006 - This file provides fast multiplication in GF(128) as required by several + This file provides fast multiplication in GF(2^128) as required by several cryptographic authentication modes */ @@ -130,9 +130,10 @@ static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); -/* These functions multiply a field element by x, by x^4 and by x^8 - * in the polynomial field representation. 
It uses 32-bit word operations - * to gain speed but compensates for machine endianess and hence works +/* + * The following functions multiply a field element by x or by x^8 in + * the polynomial field representation. They use 64-bit word operations + * to gain speed but compensate for machine endianness and hence work * correctly on both styles of machine. */ @@ -187,6 +188,16 @@ static void gf128mul_x8_bbe(be128 *x) x->b = cpu_to_be64((b << 8) ^ _tt); } +static void gf128mul_x8_ble(be128 *x) +{ + u64 a = le64_to_cpu(x->b); + u64 b = le64_to_cpu(x->a); + u64 _tt = gf128mul_table_be[a >> 56]; + + x->b = cpu_to_le64((a << 8) | (b >> 56)); + x->a = cpu_to_le64((b << 8) ^ _tt); +} + void gf128mul_lle(be128 *r, const be128 *b) { be128 p[8]; @@ -263,9 +274,48 @@ void gf128mul_bbe(be128 *r, const be128 *b) } EXPORT_SYMBOL(gf128mul_bbe); +void gf128mul_ble(be128 *r, const be128 *b) +{ + be128 p[8]; + int i; + + p[0] = *r; + for (i = 0; i < 7; ++i) + gf128mul_x_ble((be128 *)&p[i + 1], (be128 *)&p[i]); + + memset(r, 0, sizeof(*r)); + for (i = 0;;) { + u8 ch = ((u8 *)b)[15 - i]; + + if (ch & 0x80) + be128_xor(r, r, &p[7]); + if (ch & 0x40) + be128_xor(r, r, &p[6]); + if (ch & 0x20) + be128_xor(r, r, &p[5]); + if (ch & 0x10) + be128_xor(r, r, &p[4]); + if (ch & 0x08) + be128_xor(r, r, &p[3]); + if (ch & 0x04) + be128_xor(r, r, &p[2]); + if (ch & 0x02) + be128_xor(r, r, &p[1]); + if (ch & 0x01) + be128_xor(r, r, &p[0]); + + if (++i >= 16) + break; + + gf128mul_x8_ble(r); + } +} +EXPORT_SYMBOL(gf128mul_ble); + + /* This version uses 64k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key - value in GF(128). If we consider a GF(128) value in + value in GF(2^128). If we consider a GF(2^128) value in the buffer's lowest byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. But we also @@ -399,7 +449,7 @@ EXPORT_SYMBOL(gf128mul_64k_bbe); /* This version uses 4k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key - value in GF(128). If we consider a GF(128) value in a + value in GF(2^128). If we consider a GF(2^128) value in a single byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. 
If we take the highest byte in @@ -457,6 +507,28 @@ out: } EXPORT_SYMBOL(gf128mul_init_4k_bbe); +struct gf128mul_4k *gf128mul_init_4k_ble(const be128 *g) +{ + struct gf128mul_4k *t; + int j, k; + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + goto out; + + t->t[1] = *g; + for (j = 1; j <= 64; j <<= 1) + gf128mul_x_ble(&t->t[j + j], &t->t[j]); + + for (j = 2; j < 256; j += j) + for (k = 1; k < j; ++k) + be128_xor(&t->t[j + k], &t->t[j], &t->t[k]); + +out: + return t; +} +EXPORT_SYMBOL(gf128mul_init_4k_ble); + void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t) { u8 *ap = (u8 *)a; @@ -487,5 +559,20 @@ void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t) } EXPORT_SYMBOL(gf128mul_4k_bbe); +void gf128mul_4k_ble(be128 *a, struct gf128mul_4k *t) +{ + u8 *ap = (u8 *)a; + be128 r[1]; + int i = 15; + + *r = t->t[ap[15]]; + while (i--) { + gf128mul_x8_ble(r); + be128_xor(r, r, &t->t[ap[i]]); + } + *a = *r; +} +EXPORT_SYMBOL(gf128mul_4k_ble); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)"); diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index 7217fe6dbe33..230760aef93b 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h @@ -43,7 +43,7 @@ --------------------------------------------------------------------------- Issue Date: 31/01/2006 - An implementation of field multiplication in Galois Field GF(128) + An implementation of field multiplication in Galois Field GF(2^128) */ #ifndef _CRYPTO_GF128MUL_H @@ -65,7 +65,7 @@ * are left and the lsb's are right. char b[16] is an array and b[0] is * the first octet. * - * 80000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 + * 10000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 * b[0] b[1] b[2] b[3] b[13] b[14] b[15] * * Every bit is a coefficient of some power of X. We can store the bits @@ -99,21 +99,21 @@ * * bbe on a little endian machine u32 x[4]: * - * MS x[0] LS MS x[1] LS + * MS x[0] LS MS x[1] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 103..96 111.104 119.112 127.120 71...64 79...72 87...80 95...88 * - * MS x[2] LS MS x[3] LS + * MS x[2] LS MS x[3] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 39...32 47...40 55...48 63...56 07...00 15...08 23...16 31...24 * * ble on a little endian machine * - * MS x[0] LS MS x[1] LS + * MS x[0] LS MS x[1] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 31...24 23...16 15...08 07...00 63...56 55...48 47...40 39...32 * - * MS x[2] LS MS x[3] LS + * MS x[2] LS MS x[3] LS * ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls * 95...88 87...80 79...72 71...64 127.120 199.112 111.104 103..96 * @@ -127,7 +127,7 @@ * machines this will automatically aligned to wordsize and on a 64-bit * machine also. */ -/* Multiply a GF128 field element by x. Field elements are held in arrays +/* Multiply a GF128 field element by x. Field elements are held in arrays of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower indexed bits placed in the more numerically significant bit positions within bytes. 
@@ -135,45 +135,47 @@ On little endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way - MS x[0] LS MS x[1] LS + MS x[0] LS MS x[1] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 24...31 16...23 08...15 00...07 56...63 48...55 40...47 32...39 - MS x[2] LS MS x[3] LS + MS x[2] LS MS x[3] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 88...95 80...87 72...79 64...71 120.127 112.119 104.111 96..103 On big endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way - MS x[0] LS MS x[1] LS + MS x[0] LS MS x[1] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 00...07 08...15 16...23 24...31 32...39 40...47 48...55 56...63 - MS x[2] LS MS x[3] LS + MS x[2] LS MS x[3] LS ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls 64...71 72...79 80...87 88...95 96..103 104.111 112.119 120.127 */ -/* A slow generic version of gf_mul, implemented for lle and bbe - * It multiplies a and b and puts the result in a */ +/* A slow generic version of gf_mul, implemented for lle, bbe, and ble. + * It multiplies a and b and puts the result in a + */ void gf128mul_lle(be128 *a, const be128 *b); - void gf128mul_bbe(be128 *a, const be128 *b); +void gf128mul_ble(be128 *a, const be128 *b); -/* multiply by x in ble format, needed by XTS */ +/* multiply by x in ble format, needed by XTS and HEH */ void gf128mul_x_ble(be128 *a, const be128 *b); /* 4k table optimization */ - struct gf128mul_4k { be128 t[256]; }; struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g); struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g); +struct gf128mul_4k *gf128mul_init_4k_ble(const be128 *g); void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t); void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t); +void gf128mul_4k_ble(be128 *a, struct gf128mul_4k *t); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { @@ -181,16 +183,17 @@ static inline void gf128mul_free_4k(struct gf128mul_4k *t) } -/* 64k table optimization, implemented for lle and bbe */ +/* 64k table optimization, implemented for lle, ble, and bbe */ struct gf128mul_64k { struct gf128mul_4k *t[16]; }; -/* first initialize with the constant factor with which you - * want to multiply and then call gf128_64k_lle with the other - * factor in the first argument, the table in the second and a - * scratch register in the third. Afterwards *a = *r. */ +/* First initialize with the constant factor with which you + * want to multiply and then call gf128mul_64k_bbe with the other + * factor in the first argument, and the table in the second. + * Afterwards, the result is stored in *a. + */ struct gf128mul_64k *gf128mul_init_64k_lle(const be128 *g); struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g); void gf128mul_free_64k(struct gf128mul_64k *t); From 698ffc03b70134f4f4af89bf64f3bcb96e358545 Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Tue, 10 Jan 2017 16:47:49 -0800 Subject: [PATCH 0663/3220] ANDROID: crypto: heh - Add Hash-Encrypt-Hash (HEH) algorithm Hash-Encrypt-Hash (HEH) is a proposed block cipher mode of operation which extends the strong pseudo-random permutation property of block ciphers (e.g. AES) to arbitrary length input strings. This provides a stronger notion of security than existing block cipher modes of operation (e.g. CBC, CTR, XTS), though it is usually less performant. It uses two keyed invertible hash functions with a layer of ECB encryption applied in-between. 
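Schematically, HEH encryption of a message m is (informal notation; see
the draft for the precise definition):

	C = InvHash[tau, beta2]( ECB[K]( Hash[tau, beta1](m) ) )

where Hash/InvHash are the two keyed invertible hash layers (a polynomial
evaluation over GF(2^128) keyed by tau, masked with the per-message beta
keys) and ECB[K] is the underlying block cipher applied block by block.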
The algorithm is currently specified by the following Internet Draft: https://tools.ietf.org/html/draft-cope-heh-01 This patch adds HEH as a symmetric cipher only. Support for HEH as an AEAD is not yet implemented. HEH will use an existing accelerated ecb(block_cipher) implementation for the encrypt step if available. Accelerated versions of the hash step are planned but will be left for later patches. This patch backports HEH to the 4.4 Android kernel, initially for use by ext4 filenames encryption. Note that HEH is not yet upstream; however, patches have been made available on linux-crypto, and as noted there is also a draft specification available. This backport required updating the code to conform to the legacy ablkcipher API rather than the skcipher API, which wasn't complete in 4.4. Signed-off-by: Alex Cope Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I945bcc9c0115916824d701bae91b86e3f059a1a9 --- crypto/Kconfig | 17 + crypto/Makefile | 1 + crypto/heh.c | 899 +++++++++++++++++++++++++++++++++++++++++++++++ crypto/testmgr.c | 15 + crypto/testmgr.h | 194 ++++++++++ 5 files changed, 1126 insertions(+) create mode 100644 crypto/heh.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 7240821137fd..627227f1162d 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -289,6 +289,23 @@ config CRYPTO_CBC CBC: Cipher Block Chaining mode This block cipher algorithm is required for IPSec. +config CRYPTO_HEH + tristate "HEH support" + select CRYPTO_CMAC + select CRYPTO_ECB + select CRYPTO_GF128MUL + select CRYPTO_MANAGER + help + HEH: Hash-Encrypt-Hash mode + HEH is a proposed block cipher mode of operation which extends the + strong pseudo-random permutation (SPRP) property of block ciphers to + arbitrary-length input strings. This provides a stronger notion of + security than existing block cipher modes of operation (e.g. CBC, CTR, + XTS), though it is usually less performant. Applications include disk + encryption and encryption of file names and contents. Currently, this + implementation only provides a symmetric cipher interface, so it can't + yet be used as an AEAD. + config CRYPTO_CTR tristate "CTR support" select CRYPTO_BLKCIPHER diff --git a/crypto/Makefile b/crypto/Makefile index f7aba923458d..3d36c5a6a5ea 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o obj-$(CONFIG_CRYPTO_ECB) += ecb.o obj-$(CONFIG_CRYPTO_CBC) += cbc.o +obj-$(CONFIG_CRYPTO_HEH) += heh.o obj-$(CONFIG_CRYPTO_PCBC) += pcbc.o obj-$(CONFIG_CRYPTO_CTS) += cts.o obj-$(CONFIG_CRYPTO_LRW) += lrw.o diff --git a/crypto/heh.c b/crypto/heh.c new file mode 100644 index 000000000000..48a284cecaa2 --- /dev/null +++ b/crypto/heh.c @@ -0,0 +1,899 @@ +/* + * HEH: Hash-Encrypt-Hash mode + * + * Copyright (c) 2016 Google Inc. + * + * Authors: + * Alex Cope + * Eric Biggers + */ + +/* + * Hash-Encrypt-Hash (HEH) is a proposed block cipher mode of operation which + * extends the strong pseudo-random permutation (SPRP) property of block ciphers + * (e.g. AES) to arbitrary length input strings. It uses two keyed invertible + * hash functions with a layer of ECB encryption applied in-between. The + * algorithm is specified by the following Internet Draft: + * + * https://tools.ietf.org/html/draft-cope-heh-01 + * + * Although HEH can be used as either a regular symmetric cipher or as an AEAD, + * currently this module only provides it as a symmetric cipher. Additionally, + * only 16-byte nonces are supported. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * The block size is the size of GF(2^128) elements and also the required block + * size of the underlying block cipher. + */ +#define HEH_BLOCK_SIZE 16 + +struct heh_instance_ctx { + struct crypto_shash_spawn cmac; + struct crypto_skcipher_spawn ecb; +}; + +struct heh_tfm_ctx { + struct crypto_shash *cmac; + struct crypto_ablkcipher *ecb; + struct gf128mul_4k *tau_key; +}; + +struct heh_cmac_data { + u8 nonce[HEH_BLOCK_SIZE]; + __le32 nonce_length; + __le32 aad_length; + __le32 message_length; + __le32 padding; +}; + +struct heh_req_ctx { /* aligned to alignmask */ + be128 beta1_key; + be128 beta2_key; + union { + struct { + struct heh_cmac_data data; + struct shash_desc desc; + /* + crypto_shash_descsize(cmac) */ + } cmac; + struct { + u8 keystream[HEH_BLOCK_SIZE]; + u8 tmp[HEH_BLOCK_SIZE]; + struct scatterlist tmp_sgl[2]; + struct ablkcipher_request req; + /* + crypto_ablkcipher_reqsize(ecb) */ + } ecb; + } u; +}; + +/* + * Get the offset in bytes to the last full block, or equivalently the length of + * all full blocks excluding the last + */ +static inline unsigned int get_tail_offset(unsigned int len) +{ + len -= len % HEH_BLOCK_SIZE; + return len - HEH_BLOCK_SIZE; +} + +static inline struct heh_req_ctx *heh_req_ctx(struct ablkcipher_request *req) +{ + unsigned int alignmask = crypto_ablkcipher_alignmask( + crypto_ablkcipher_reqtfm(req)); + + return (void *)PTR_ALIGN((u8 *)ablkcipher_request_ctx(req), + alignmask + 1); +} + +static inline void async_done(struct crypto_async_request *areq, int err, + int (*next_step)(struct ablkcipher_request *, + u32)) +{ + struct ablkcipher_request *req = areq->data; + + if (err) + goto out; + + err = next_step(req, req->base.flags & ~CRYPTO_TFM_REQ_MAY_SLEEP); + if (err == -EINPROGRESS || + (err == -EBUSY && (req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))) + return; +out: + ablkcipher_request_complete(req, err); +} + +/* + * Generate the per-message "beta" keys used by the hashing layers of HEH. The + * first beta key is the CMAC of the nonce, the additional authenticated data + * (AAD), and the lengths in bytes of the nonce, AAD, and message. The nonce + * and AAD are each zero-padded to the next 16-byte block boundary, and the + * lengths are serialized as 4-byte little endian integers and zero-padded to + * the next 16-byte block boundary. + * The second beta key is the first one interpreted as an element in GF(2^128) + * and multiplied by x. + * + * Note that because the nonce and AAD may, in general, be variable-length, the + * key generation must be done by a pseudo-random function (PRF) on + * variable-length inputs. CBC-MAC does not satisfy this, as it is only a PRF + * on fixed-length inputs. CMAC remedies this flaw. Including the lengths of + * the nonce, AAD, and message is also critical to avoid collisions. + * + * That being said, this implementation does not yet operate as an AEAD and + * therefore there is never any AAD, nor are variable-length nonces supported. 
+ */ +static int generate_betas(struct ablkcipher_request *req, + be128 *beta1_key, be128 *beta2_key) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct heh_req_ctx *rctx = heh_req_ctx(req); + struct heh_cmac_data *data = &rctx->u.cmac.data; + struct shash_desc *desc = &rctx->u.cmac.desc; + int err; + + BUILD_BUG_ON(sizeof(*data) != 2 * HEH_BLOCK_SIZE); + memcpy(data->nonce, req->info, HEH_BLOCK_SIZE); + data->nonce_length = cpu_to_le32(HEH_BLOCK_SIZE); + data->aad_length = cpu_to_le32(0); + data->message_length = cpu_to_le32(req->nbytes); + data->padding = cpu_to_le32(0); + + desc->tfm = ctx->cmac; + desc->flags = req->base.flags; + + err = crypto_shash_digest(desc, (const u8 *)data, sizeof(*data), + (u8 *)beta1_key); + if (err) + return err; + + gf128mul_x_ble(beta2_key, beta1_key); + return 0; +} + +/* + * Evaluation of a polynomial over GF(2^128) using Horner's rule. The + * polynomial is evaluated at 'point'. The polynomial's coefficients are taken + * from 'coeffs_sgl' and are for terms with consecutive descending degree ending + * at degree 1. 'bytes_of_coeffs' is 16 times the number of terms. + */ +static be128 evaluate_polynomial(struct gf128mul_4k *point, + struct scatterlist *coeffs_sgl, + unsigned int bytes_of_coeffs) +{ + be128 value = {0}; + struct sg_mapping_iter miter; + unsigned int remaining = bytes_of_coeffs; + unsigned int needed = 0; + + sg_miter_start(&miter, coeffs_sgl, sg_nents(coeffs_sgl), + SG_MITER_FROM_SG | SG_MITER_ATOMIC); + while (remaining) { + be128 coeff; + const u8 *src; + unsigned int srclen; + u8 *dst = (u8 *)&value; + + /* + * Note: scatterlist elements are not necessarily evenly + * divisible into blocks, nor are they necessarily aligned to + * __alignof__(be128). + */ + sg_miter_next(&miter); + + src = miter.addr; + srclen = min_t(unsigned int, miter.length, remaining); + remaining -= srclen; + + if (needed) { + unsigned int n = min(srclen, needed); + u8 *pos = dst + (HEH_BLOCK_SIZE - needed); + + needed -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!needed) + gf128mul_4k_ble(&value, point); + } + + while (srclen >= HEH_BLOCK_SIZE) { + memcpy(&coeff, src, HEH_BLOCK_SIZE); + be128_xor(&value, &value, &coeff); + gf128mul_4k_ble(&value, point); + src += HEH_BLOCK_SIZE; + srclen -= HEH_BLOCK_SIZE; + } + + if (srclen) { + needed = HEH_BLOCK_SIZE - srclen; + do { + *dst++ ^= *src++; + } while (--srclen); + } + } + sg_miter_stop(&miter); + return value; +} + +/* + * Split the message into 16 byte blocks, padding out the last block, and use + * the blocks as coefficients in the evaluation of a polynomial over GF(2^128) + * at the secret point 'tau_key'. For ease of implementing the higher-level + * heh_hash_inv() function, the constant and degree-1 coefficients are swapped + * if there is a partial block. + * + * Mathematically, compute: + * if (no partial block) + * k^{N-1} * m_0 + ... + k * m_{N-2} + m_{N-1} + * else if (partial block) + * k^N * m_0 + ... 
+ k^2 * m_{N-2} + k * m_N + m_{N-1}
+ *
+ * where:
+ *	k is tau_key
+ *	N is the number of full blocks in the message
+ *	m_i is the i-th full block in the message for i = 0 to N-1 inclusive
+ *	m_N is the partial block of the message zero-padded up to 16 bytes
+ */
+static be128 poly_hash(struct crypto_ablkcipher *tfm, struct scatterlist *sgl,
+		       unsigned int len)
+{
+	struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	unsigned int tail_offset = get_tail_offset(len);
+	unsigned int tail_len = len - tail_offset;
+	be128 hash;
+	be128 tail[2];
+
+	/* Handle all full blocks except the last */
+	hash = evaluate_polynomial(ctx->tau_key, sgl, tail_offset);
+
+	/* Handle the last full block and the partial block */
+	scatterwalk_map_and_copy(tail, sgl, tail_offset, tail_len, 0);
+
+	if (tail_len != HEH_BLOCK_SIZE) {
+		/* handle the partial block */
+		memset((u8 *)tail + tail_len, 0, sizeof(tail) - tail_len);
+		be128_xor(&hash, &hash, &tail[1]);
+		gf128mul_4k_ble(&hash, ctx->tau_key);
+	}
+	be128_xor(&hash, &hash, &tail[0]);
+	return hash;
+}
+
+/*
+ * Transform all full blocks except the last.
+ * This is used by both the hash and inverse hash phases.
+ */
+static int heh_tfm_blocks(struct ablkcipher_request *req,
+			  struct scatterlist *src_sgl,
+			  struct scatterlist *dst_sgl, unsigned int len,
+			  const be128 *hash, const be128 *beta_key)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct blkcipher_desc desc = { .flags = req->base.flags };
+	struct blkcipher_walk walk;
+	be128 e = *beta_key;
+	int err;
+	unsigned int nbytes;
+
+	blkcipher_walk_init(&walk, dst_sgl, src_sgl, len);
+
+	err = blkcipher_ablkcipher_walk_virt(&desc, &walk, tfm);
+
+	while ((nbytes = walk.nbytes)) {
+		const be128 *src = (be128 *)walk.src.virt.addr;
+		be128 *dst = (be128 *)walk.dst.virt.addr;
+
+		do {
+			gf128mul_x_ble(&e, &e);
+			be128_xor(dst, src, hash);
+			be128_xor(dst, dst, &e);
+			src++;
+			dst++;
+		} while ((nbytes -= HEH_BLOCK_SIZE) >= HEH_BLOCK_SIZE);
+		err = blkcipher_walk_done(&desc, &walk, nbytes);
+	}
+	return err;
+}
+
+/*
+ * The hash phase of HEH. Given a message, compute:
+ *
+ *	(m_0 + H, ..., m_{N-2} + H, H, m_N) + (xb, x^2b, ..., x^{N-1}b, b, 0)
+ *
+ * where:
+ *	N is the number of full blocks in the message
+ *	m_i is the i-th full block in the message for i = 0 to N-1 inclusive
+ *	m_N is the unpadded partial block, possibly empty
+ *	H is the poly_hash() of the message, keyed by tau_key
+ *	b is beta_key
+ *	x is the element x in our representation of GF(2^128)
+ *
+ * Note that the partial block remains unchanged, but it does affect the result
+ * of poly_hash() and therefore the transformation of all the full blocks.
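+ *
+ * As a worked instance of the formula above: with N = 2 full blocks and a
+ * trailing partial block m_2, the output is
+ *
+ *	(m_0 + H + xb, H + b, m_2)
+ *
+ * where + is XOR in GF(2^128).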
+ */
+static int heh_hash(struct ablkcipher_request *req, const be128 *beta_key)
+{
+	be128 hash;
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	unsigned int tail_offset = get_tail_offset(req->nbytes);
+	unsigned int partial_len = req->nbytes % HEH_BLOCK_SIZE;
+	int err;
+
+	/* poly_hash() the full message including the partial block */
+	hash = poly_hash(tfm, req->src, req->nbytes);
+
+	/* Transform all full blocks except the last */
+	err = heh_tfm_blocks(req, req->src, req->dst, tail_offset, &hash,
+			     beta_key);
+	if (err)
+		return err;
+
+	/* Set the last full block to hash XOR beta_key */
+	be128_xor(&hash, &hash, beta_key);
+	scatterwalk_map_and_copy(&hash, req->dst, tail_offset, HEH_BLOCK_SIZE,
+				 1);
+
+	/* Copy the partial block if needed */
+	if (partial_len != 0 && req->src != req->dst) {
+		unsigned int offs = tail_offset + HEH_BLOCK_SIZE;
+
+		scatterwalk_map_and_copy(&hash, req->src, offs, partial_len, 0);
+		scatterwalk_map_and_copy(&hash, req->dst, offs, partial_len, 1);
+	}
+	return 0;
+}
+
+/*
+ * The inverse hash phase of HEH. This undoes the result of heh_hash().
+ */
+static int heh_hash_inv(struct ablkcipher_request *req, const be128 *beta_key)
+{
+	be128 hash;
+	be128 tmp;
+	struct scatterlist tmp_sgl[2];
+	struct scatterlist *tail_sgl;
+	unsigned int len = req->nbytes;
+	unsigned int tail_offset = get_tail_offset(len);
+	struct scatterlist *sgl = req->dst;
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	int err;
+
+	/*
+	 * The last full block was computed as hash XOR beta_key, so XOR it with
+	 * beta_key to recover hash.
+	 */
+	tail_sgl = scatterwalk_ffwd(tmp_sgl, sgl, tail_offset);
+	scatterwalk_map_and_copy(&hash, tail_sgl, 0, HEH_BLOCK_SIZE, 0);
+	be128_xor(&hash, &hash, beta_key);
+
+	/* Transform all full blocks except the last */
+	err = heh_tfm_blocks(req, sgl, sgl, tail_offset, &hash, beta_key);
+	if (err)
+		return err;
+
+	/*
+	 * Recover the last full block. We know 'hash', i.e. the poly_hash() of
+	 * the original message. The last full block was the constant term
+	 * of the polynomial. To recover the last full block, temporarily zero
+	 * it, compute the poly_hash(), and take the difference from 'hash'.
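+	 *
+	 * (In GF(2^128) terms: zeroing the constant term c turns 'hash' into
+	 * hash' = hash + c, and since addition is XOR, c = hash + hash'.)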
+	 */
+	memset(&tmp, 0, sizeof(tmp));
+	scatterwalk_map_and_copy(&tmp, tail_sgl, 0, HEH_BLOCK_SIZE, 1);
+	tmp = poly_hash(tfm, sgl, len);
+	be128_xor(&tmp, &tmp, &hash);
+	scatterwalk_map_and_copy(&tmp, tail_sgl, 0, HEH_BLOCK_SIZE, 1);
+	return 0;
+}
+
+static int heh_hash_inv_step(struct ablkcipher_request *req, u32 flags)
+{
+	struct heh_req_ctx *rctx = heh_req_ctx(req);
+
+	return heh_hash_inv(req, &rctx->beta2_key);
+}
+
+static int heh_ecb_step_3(struct ablkcipher_request *req, u32 flags)
+{
+	struct heh_req_ctx *rctx = heh_req_ctx(req);
+	u8 partial_block[HEH_BLOCK_SIZE] __aligned(__alignof__(u32));
+	unsigned int tail_offset = get_tail_offset(req->nbytes);
+	unsigned int partial_offset = tail_offset + HEH_BLOCK_SIZE;
+	unsigned int partial_len = req->nbytes - partial_offset;
+
+	/*
+	 * Extract the pad in req->dst at tail_offset, and XOR the partial block
+	 * with it to create the encrypted partial block
+	 */
+	scatterwalk_map_and_copy(rctx->u.ecb.keystream, req->dst, tail_offset,
+				 HEH_BLOCK_SIZE, 0);
+	scatterwalk_map_and_copy(partial_block, req->dst, partial_offset,
+				 partial_len, 0);
+	crypto_xor(partial_block, rctx->u.ecb.keystream, partial_len);
+
+	/*
+	 * Store the encrypted final block and partial block back in dst_sg
+	 */
+	scatterwalk_map_and_copy(&rctx->u.ecb.tmp, req->dst, tail_offset,
+				 HEH_BLOCK_SIZE, 1);
+	scatterwalk_map_and_copy(partial_block, req->dst, partial_offset,
+				 partial_len, 1);
+
+	return heh_hash_inv_step(req, flags);
+}
+
+static void heh_ecb_step_2_done(struct crypto_async_request *areq, int err)
+{
+	return async_done(areq, err, heh_ecb_step_3);
+}
+
+static int heh_ecb_step_2(struct ablkcipher_request *req, u32 flags)
+{
+	struct heh_req_ctx *rctx = heh_req_ctx(req);
+	unsigned int partial_len = req->nbytes % HEH_BLOCK_SIZE;
+	struct scatterlist *tmp_sgl;
+	int err;
+	unsigned int tail_offset = get_tail_offset(req->nbytes);
+
+	if (partial_len == 0)
+		return heh_hash_inv_step(req, flags);
+
+	/*
+	 * Extract the final full block, store it in tmp, and then XOR that with
+	 * the value saved in u.ecb.keystream
+	 */
+	scatterwalk_map_and_copy(rctx->u.ecb.tmp, req->dst, tail_offset,
+				 HEH_BLOCK_SIZE, 0);
+	crypto_xor(rctx->u.ecb.keystream, rctx->u.ecb.tmp, HEH_BLOCK_SIZE);
+
+	/*
+	 * Encrypt the value in rctx->u.ecb.keystream to create the pad for the
+	 * partial block.
+	 * We cannot encrypt stack buffers, so re-use the dst_sg to do this
+	 * encryption to avoid a malloc. The value at tail_offset is stored in
+	 * tmp, and will be restored later.
+	 */
+	scatterwalk_map_and_copy(rctx->u.ecb.keystream, req->dst, tail_offset,
+				 HEH_BLOCK_SIZE, 1);
+	tmp_sgl = scatterwalk_ffwd(rctx->u.ecb.tmp_sgl, req->dst, tail_offset);
+	ablkcipher_request_set_callback(&rctx->u.ecb.req, flags,
+					heh_ecb_step_2_done, req);
+	ablkcipher_request_set_crypt(&rctx->u.ecb.req, tmp_sgl, tmp_sgl,
+				     HEH_BLOCK_SIZE, NULL);
+	err = crypto_ablkcipher_encrypt(&rctx->u.ecb.req);
+	if (err)
+		return err;
+	return heh_ecb_step_3(req, flags);
+}
+
+static void heh_ecb_full_done(struct crypto_async_request *areq, int err)
+{
+	return async_done(areq, err, heh_ecb_step_2);
+}
+
+/*
+ * The encrypt phase of HEH. This uses ECB encryption, with special handling
+ * for the partial block at the end, if any. The source data is already in
+ * req->dst, so the encryption happens in-place.
+ *
+ * After the encrypt phase we continue on to the inverse hash phase. The
+ * function calls are chained to support asynchronous ECB algorithms.
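+ *
+ * Sketch of the partial-block handling performed by the steps below: let y
+ * be the last full block going into the ECB layer and y' the corresponding
+ * ECB output. The pad for the partial block is computed as E(y XOR y'),
+ * where E() is a single-block ECB encryption; because this expression is
+ * symmetric in y and y', encryption and decryption derive the same pad.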
+ */ +static int heh_ecb(struct ablkcipher_request *req, bool decrypt) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct heh_req_ctx *rctx = heh_req_ctx(req); + struct ablkcipher_request *ecb_req = &rctx->u.ecb.req; + unsigned int tail_offset = get_tail_offset(req->nbytes); + unsigned int full_len = tail_offset + HEH_BLOCK_SIZE; + int err; + + /* + * Save the last full block before it is encrypted/decrypted. This will + * be used later to encrypt/decrypt the partial block + */ + scatterwalk_map_and_copy(rctx->u.ecb.keystream, req->dst, tail_offset, + HEH_BLOCK_SIZE, 0); + + /* Encrypt/decrypt all full blocks */ + ablkcipher_request_set_tfm(ecb_req, ctx->ecb); + ablkcipher_request_set_callback(ecb_req, req->base.flags, + heh_ecb_full_done, req); + ablkcipher_request_set_crypt(ecb_req, req->dst, req->dst, full_len, + NULL); + if (decrypt) + err = crypto_ablkcipher_decrypt(ecb_req); + else + err = crypto_ablkcipher_encrypt(ecb_req); + if (err) + return err; + + return heh_ecb_step_2(req, req->base.flags); +} + +static int heh_crypt(struct ablkcipher_request *req, bool decrypt) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct heh_req_ctx *rctx = heh_req_ctx(req); + int err; + + /* Inputs must be at least one full block */ + if (req->nbytes < HEH_BLOCK_SIZE) + return -EINVAL; + + /* Key must have been set */ + if (!ctx->tau_key) + return -ENOKEY; + err = generate_betas(req, &rctx->beta1_key, &rctx->beta2_key); + if (err) + return err; + + if (decrypt) + swap(rctx->beta1_key, rctx->beta2_key); + + err = heh_hash(req, &rctx->beta1_key); + if (err) + return err; + + return heh_ecb(req, decrypt); +} + +static int heh_encrypt(struct ablkcipher_request *req) +{ + return heh_crypt(req, false); +} + +static int heh_decrypt(struct ablkcipher_request *req) +{ + return heh_crypt(req, true); +} + +static int heh_setkey(struct crypto_ablkcipher *parent, const u8 *key, + unsigned int keylen) +{ + struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(parent); + struct crypto_shash *cmac = ctx->cmac; + struct crypto_ablkcipher *ecb = ctx->ecb; + SHASH_DESC_ON_STACK(desc, cmac); + u8 *derived_keys; + u8 digest[HEH_BLOCK_SIZE]; + unsigned int i; + int err; + + /* set prf_key = key */ + crypto_shash_clear_flags(cmac, CRYPTO_TFM_REQ_MASK); + crypto_shash_set_flags(cmac, crypto_ablkcipher_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_shash_setkey(cmac, key, keylen); + crypto_ablkcipher_set_flags(parent, crypto_shash_get_flags(cmac) & + CRYPTO_TFM_RES_MASK); + if (err) + return err; + + /* + * Generate tau_key and ecb_key as follows: + * tau_key = cmac(prf_key, 0x00...01) + * ecb_key = cmac(prf_key, 0x00...02) || cmac(prf_key, 0x00...03) || ... 
+ *	     truncated to keylen bytes
+ */
+	derived_keys = kzalloc(round_up(HEH_BLOCK_SIZE + keylen,
+					HEH_BLOCK_SIZE), GFP_KERNEL);
+	if (!derived_keys)
+		return -ENOMEM;
+	desc->tfm = cmac;
+	desc->flags = (crypto_shash_get_flags(cmac) & CRYPTO_TFM_REQ_MASK);
+	for (i = 0; i < keylen + HEH_BLOCK_SIZE; i += HEH_BLOCK_SIZE) {
+		derived_keys[i + HEH_BLOCK_SIZE - 1] =
+					0x01 + i / HEH_BLOCK_SIZE;
+		err = crypto_shash_digest(desc, derived_keys + i,
+					  HEH_BLOCK_SIZE, digest);
+		if (err)
+			goto out;
+		memcpy(derived_keys + i, digest, HEH_BLOCK_SIZE);
+	}
+
+	if (ctx->tau_key)
+		gf128mul_free_4k(ctx->tau_key);
+	err = -ENOMEM;
+	ctx->tau_key = gf128mul_init_4k_ble((const be128 *)derived_keys);
+	if (!ctx->tau_key)
+		goto out;
+
+	crypto_ablkcipher_clear_flags(ecb, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(ecb, crypto_ablkcipher_get_flags(parent) &
+				    CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(ecb, derived_keys + HEH_BLOCK_SIZE,
+				       keylen);
+	crypto_ablkcipher_set_flags(parent, crypto_ablkcipher_get_flags(ecb) &
+				    CRYPTO_TFM_RES_MASK);
+out:
+	kzfree(derived_keys);
+	return err;
+}
+
+static int heh_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
+	struct heh_instance_ctx *ictx = crypto_instance_ctx(inst);
+	struct heh_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_shash *cmac;
+	struct crypto_ablkcipher *ecb;
+	unsigned int reqsize;
+	int err;
+
+	cmac = crypto_spawn_shash(&ictx->cmac);
+	if (IS_ERR(cmac))
+		return PTR_ERR(cmac);
+
+	ecb = crypto_spawn_skcipher(&ictx->ecb);
+	err = PTR_ERR(ecb);
+	if (IS_ERR(ecb))
+		goto err_free_cmac;
+
+	ctx->cmac = cmac;
+	ctx->ecb = ecb;
+
+	reqsize = crypto_tfm_alg_alignmask(tfm) &
+		  ~(crypto_tfm_ctx_alignment() - 1);
+	reqsize += max(offsetof(struct heh_req_ctx, u.cmac.desc) +
+		       sizeof(struct shash_desc) +
+		       crypto_shash_descsize(cmac),
+		       offsetof(struct heh_req_ctx, u.ecb.req) +
+		       sizeof(struct ablkcipher_request) +
+		       crypto_ablkcipher_reqsize(ecb));
+	tfm->crt_ablkcipher.reqsize = reqsize;
+	return 0;
+
+err_free_cmac:
+	crypto_free_shash(cmac);
+	return err;
+}
+
+static void heh_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct heh_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	gf128mul_free_4k(ctx->tau_key);
+	crypto_free_shash(ctx->cmac);
+	crypto_free_ablkcipher(ctx->ecb);
+}
+
+static void heh_free_instance(struct crypto_instance *inst)
+{
+	struct heh_instance_ctx *ctx = crypto_instance_ctx(inst);
+
+	crypto_drop_shash(&ctx->cmac);
+	crypto_drop_skcipher(&ctx->ecb);
+	kfree(inst);
+}
+
+/*
+ * Create an instance of HEH as an ablkcipher.
+ *
+ * This relies on underlying CMAC and ECB algorithms, usually cmac(aes) and
+ * ecb(aes). For performance reasons we support asynchronous ECB algorithms.
+ * However, we do not yet support asynchronous CMAC algorithms because CMAC is
+ * only used on a small fixed amount of data per request, independent of the
+ * request length. This would change if AEAD or variable-length nonce support
+ * were to be exposed.
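+ *
+ * For example, instantiating "heh(aes)" grabs "cmac(aes)" and "ecb(aes)";
+ * depending on which implementations the crypto API selects, the resulting
+ * driver name might look like "heh_base(cmac(aes-generic),ecb-aes-ce)".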
+ */ +static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, + const char *full_name, const char *cmac_name, + const char *ecb_name) +{ + struct crypto_attr_type *algt; + struct crypto_instance *inst; + struct heh_instance_ctx *ctx; + struct shash_alg *cmac; + struct crypto_alg *ecb; + int err; + + algt = crypto_get_attr_type(tb); + if (IS_ERR(algt)) + return PTR_ERR(algt); + + /* User must be asking for something compatible with ablkcipher */ + if ((algt->type ^ CRYPTO_ALG_TYPE_ABLKCIPHER) & algt->mask) + return -EINVAL; + + /* Allocate the ablkcipher instance */ + inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); + if (!inst) + return -ENOMEM; + + ctx = crypto_instance_ctx(inst); + + /* Set up the cmac and ecb spawns */ + + ctx->cmac.base.inst = inst; + err = crypto_grab_shash(&ctx->cmac, cmac_name, 0, CRYPTO_ALG_ASYNC); + if (err) + goto err_free_inst; + cmac = crypto_spawn_shash_alg(&ctx->cmac); + err = -EINVAL; + if (cmac->digestsize != HEH_BLOCK_SIZE) + goto err_drop_cmac; + + ctx->ecb.base.inst = inst; + err = crypto_grab_skcipher(&ctx->ecb, ecb_name, 0, + crypto_requires_sync(algt->type, + algt->mask)); + if (err) + goto err_drop_cmac; + ecb = crypto_skcipher_spawn_alg(&ctx->ecb); + + /* HEH only supports block ciphers with 16 byte block size */ + err = -EINVAL; + if (ecb->cra_blocksize != HEH_BLOCK_SIZE) + goto err_drop_ecb; + + /* The underlying "ECB" algorithm must not require an IV */ + err = -EINVAL; + if ((ecb->cra_flags & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_BLKCIPHER) { + if (ecb->cra_blkcipher.ivsize != 0) + goto err_drop_ecb; + } else { + if (ecb->cra_ablkcipher.ivsize != 0) + goto err_drop_ecb; + } + + /* Set the instance names */ + err = -ENAMETOOLONG; + if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, + "heh_base(%s,%s)", cmac->base.cra_driver_name, + ecb->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) + goto err_drop_ecb; + + err = -ENAMETOOLONG; + if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, + "%s", full_name) >= CRYPTO_MAX_ALG_NAME) + goto err_drop_ecb; + + /* Finish initializing the instance */ + + inst->alg.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | + ((cmac->base.cra_flags | ecb->cra_flags) & + CRYPTO_ALG_ASYNC); + inst->alg.cra_blocksize = HEH_BLOCK_SIZE; + inst->alg.cra_ctxsize = sizeof(struct heh_tfm_ctx); + inst->alg.cra_alignmask = ecb->cra_alignmask | (__alignof__(be128) - 1); + inst->alg.cra_priority = ecb->cra_priority; + inst->alg.cra_type = &crypto_ablkcipher_type; + inst->alg.cra_init = heh_init_tfm; + inst->alg.cra_exit = heh_exit_tfm; + + inst->alg.cra_ablkcipher.setkey = heh_setkey; + inst->alg.cra_ablkcipher.encrypt = heh_encrypt; + inst->alg.cra_ablkcipher.decrypt = heh_decrypt; + if ((ecb->cra_flags & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_BLKCIPHER) { + inst->alg.cra_ablkcipher.min_keysize = ecb->cra_blkcipher.min_keysize; + inst->alg.cra_ablkcipher.max_keysize = ecb->cra_blkcipher.max_keysize; + } else { + inst->alg.cra_ablkcipher.min_keysize = ecb->cra_ablkcipher.min_keysize; + inst->alg.cra_ablkcipher.max_keysize = ecb->cra_ablkcipher.max_keysize; + } + inst->alg.cra_ablkcipher.ivsize = HEH_BLOCK_SIZE; + + /* Register the instance */ + err = crypto_register_instance(tmpl, inst); + if (err) + goto err_drop_ecb; + return 0; + +err_drop_ecb: + crypto_drop_skcipher(&ctx->ecb); +err_drop_cmac: + crypto_drop_shash(&ctx->cmac); +err_free_inst: + kfree(inst); + return err; +} + +static int heh_create(struct crypto_template *tmpl, struct rtattr **tb) +{ + const char *cipher_name; + char 
full_name[CRYPTO_MAX_ALG_NAME]; + char cmac_name[CRYPTO_MAX_ALG_NAME]; + char ecb_name[CRYPTO_MAX_ALG_NAME]; + + /* Get the name of the requested block cipher (e.g. aes) */ + cipher_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(cipher_name)) + return PTR_ERR(cipher_name); + + if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "heh(%s)", cipher_name) >= + CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + if (snprintf(cmac_name, CRYPTO_MAX_ALG_NAME, "cmac(%s)", cipher_name) >= + CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + if (snprintf(ecb_name, CRYPTO_MAX_ALG_NAME, "ecb(%s)", cipher_name) >= + CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + return heh_create_common(tmpl, tb, full_name, cmac_name, ecb_name); +} + +static struct crypto_template heh_tmpl = { + .name = "heh", + .create = heh_create, + .free = heh_free_instance, + .module = THIS_MODULE, +}; + +static int heh_base_create(struct crypto_template *tmpl, struct rtattr **tb) +{ + char full_name[CRYPTO_MAX_ALG_NAME]; + const char *cmac_name; + const char *ecb_name; + + cmac_name = crypto_attr_alg_name(tb[1]); + if (IS_ERR(cmac_name)) + return PTR_ERR(cmac_name); + + ecb_name = crypto_attr_alg_name(tb[2]); + if (IS_ERR(ecb_name)) + return PTR_ERR(ecb_name); + + if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "heh_base(%s,%s)", + cmac_name, ecb_name) >= CRYPTO_MAX_ALG_NAME) + return -ENAMETOOLONG; + + return heh_create_common(tmpl, tb, full_name, cmac_name, ecb_name); +} + +/* + * If HEH is instantiated as "heh_base" instead of "heh", then specific + * implementations of cmac and ecb can be specified instead of just the cipher + */ +static struct crypto_template heh_base_tmpl = { + .name = "heh_base", + .create = heh_base_create, + .free = heh_free_instance, + .module = THIS_MODULE, +}; + +static int __init heh_module_init(void) +{ + int err; + + err = crypto_register_template(&heh_tmpl); + if (err) + return err; + + err = crypto_register_template(&heh_base_tmpl); + if (err) + goto out_undo_heh; + + return 0; + +out_undo_heh: + crypto_unregister_template(&heh_tmpl); + return err; +} + +static void __exit heh_module_exit(void) +{ + crypto_unregister_template(&heh_tmpl); + crypto_unregister_template(&heh_base_tmpl); +} + +module_init(heh_module_init); +module_exit(heh_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Hash-Encrypt-Hash block cipher mode"); +MODULE_ALIAS_CRYPTO("heh"); +MODULE_ALIAS_CRYPTO("heh_base"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index ae8c57fd8bc7..f03ae20a5735 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -3196,6 +3196,21 @@ static const struct alg_test_desc alg_test_descs[] = { .count = GHASH_TEST_VECTORS } } + }, { + .alg = "heh(aes)", + .test = alg_test_skcipher, + .suite = { + .cipher = { + .enc = { + .vecs = aes_heh_enc_tv_template, + .count = AES_HEH_ENC_TEST_VECTORS + }, + .dec = { + .vecs = aes_heh_dec_tv_template, + .count = AES_HEH_DEC_TEST_VECTORS + } + } + } }, { .alg = "hmac(crc32)", .test = alg_test_hash, diff --git a/crypto/testmgr.h b/crypto/testmgr.h index da0a8fd765f4..97a523993bd8 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -14139,6 +14139,8 @@ static struct cipher_testvec cast6_xts_dec_tv_template[] = { #define AES_DEC_TEST_VECTORS 4 #define AES_CBC_ENC_TEST_VECTORS 5 #define AES_CBC_DEC_TEST_VECTORS 5 +#define AES_HEH_ENC_TEST_VECTORS 4 +#define AES_HEH_DEC_TEST_VECTORS 4 #define HMAC_MD5_ECB_CIPHER_NULL_ENC_TEST_VECTORS 2 #define HMAC_MD5_ECB_CIPHER_NULL_DEC_TEST_VECTORS 2 #define HMAC_SHA1_ECB_CIPHER_NULL_ENC_TEST_VEC 2 @@ -14511,6 +14513,198 @@ 
static struct cipher_testvec aes_dec_tv_template[] = { }, }; +static struct cipher_testvec aes_heh_enc_tv_template[] = { + { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .ilen = 16, + .result = "\xd8\xbd\x40\xbf\xca\xe5\xee\x81" + "\x0f\x3d\x1f\x1f\xae\x89\x07\x55", + .rlen = 16, + .also_non_np = 1, + .np = 2, + .tap = { 8, 8 }, + }, { + .key = "\xa8\xda\x24\x9b\x5e\xfa\x13\xc2" + "\xc1\x94\xbf\x32\xba\x38\xa3\x77", + .klen = 16, + .iv = "\x4d\x47\x61\x37\x2b\x47\x86\xf0" + "\xd6\x47\xb5\xc2\xe8\xcf\x85\x27", + .input = "\xb8\xee\x29\xe4\xa5\xd1\xe7\x55" + "\xd0\xfd\xe7\x22\x63\x76\x36\xe2" + "\xf8\x0c\xf8\xfe\x65\x76\xe7\xca" + "\xc1\x42\xf5\xca\x5a\xa8\xac\x2a", + .ilen = 32, + .result = "\x59\xf2\x78\x4e\x10\x94\xf9\x5c" + "\x22\x23\x78\x2a\x30\x48\x11\x97" + "\xb1\xfe\x70\xc4\xef\xdf\x04\xef" + "\x16\x39\x04\xcf\xc0\x95\x9a\x98", + .rlen = 32, + .also_non_np = 1, + .np = 3, + .tap = { 16, 13, 3 }, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00", + .ilen = 63, + .result = "\xe0\x40\xeb\xe9\x52\xbe\x65\x60" + "\xe4\x68\x68\xa3\x73\x75\xb8\x52" + "\xef\x38\x6a\x87\x25\x25\xf6\x04" + "\xe5\x8e\xbe\x14\x8b\x02\x14\x1f" + "\xa9\x73\xb7\xad\x15\xbe\x9c\xa0" + "\xd2\x8a\x2c\xdc\xd4\xe3\x05\x55" + "\x0a\xf5\xf8\x51\xee\xe5\x62\xa5" + "\x71\xa7\x7c\x15\x5d\x7a\x9e", + .rlen = 63, + .also_non_np = 1, + .np = 8, + .tap = { 20, 20, 10, 8, 2, 1, 1, 1 }, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00", + .ilen = 63, + .result = "\x4b\x1a\x15\xa0\xaf\x08\x6d\x70" + "\xf0\xa7\x97\xb5\x31\x4b\x8c\xc3" + "\x4d\xf2\x7a\x9d\xdd\xd4\x15\x99" + "\x57\xad\xc6\xb1\x35\x69\xf5\x6a" + "\x2d\x70\xe4\x97\x49\xb2\x9f\x71" + "\xde\x22\xb5\x70\x8c\x69\x24\xd3" + "\xad\x80\x58\x48\x90\xe4\xed\xba" + "\x76\x3d\x71\x7c\x57\x25\x87", + .rlen = 63, + .also_non_np = 1, + .np = 8, + .tap = { 20, 20, 10, 8, 2, 1, 1, 1 }, + } +}; + +static struct cipher_testvec aes_heh_dec_tv_template[] = { + { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\xd8\xbd\x40\xbf\xca\xe5\xee\x81" + "\x0f\x3d\x1f\x1f\xae\x89\x07\x55", + .ilen = 16, + .result = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .rlen = 16, + .also_non_np = 1, + .np = 2, + .tap = { 8, 8 }, 
+ }, { + .key = "\xa8\xda\x24\x9b\x5e\xfa\x13\xc2" + "\xc1\x94\xbf\x32\xba\x38\xa3\x77", + .klen = 16, + .iv = "\x4d\x47\x61\x37\x2b\x47\x86\xf0" + "\xd6\x47\xb5\xc2\xe8\xcf\x85\x27", + .input = "\x59\xf2\x78\x4e\x10\x94\xf9\x5c" + "\x22\x23\x78\x2a\x30\x48\x11\x97" + "\xb1\xfe\x70\xc4\xef\xdf\x04\xef" + "\x16\x39\x04\xcf\xc0\x95\x9a\x98", + .ilen = 32, + .result = "\xb8\xee\x29\xe4\xa5\xd1\xe7\x55" + "\xd0\xfd\xe7\x22\x63\x76\x36\xe2" + "\xf8\x0c\xf8\xfe\x65\x76\xe7\xca" + "\xc1\x42\xf5\xca\x5a\xa8\xac\x2a", + .rlen = 32, + .also_non_np = 1, + .np = 3, + .tap = { 16, 13, 3 }, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\xe0\x40\xeb\xe9\x52\xbe\x65\x60" + "\xe4\x68\x68\xa3\x73\x75\xb8\x52" + "\xef\x38\x6a\x87\x25\x25\xf6\x04" + "\xe5\x8e\xbe\x14\x8b\x02\x14\x1f" + "\xa9\x73\xb7\xad\x15\xbe\x9c\xa0" + "\xd2\x8a\x2c\xdc\xd4\xe3\x05\x55" + "\x0a\xf5\xf8\x51\xee\xe5\x62\xa5" + "\x71\xa7\x7c\x15\x5d\x7a\x9e", + .ilen = 63, + .result = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00", + .rlen = 63, + .also_non_np = 1, + .np = 8, + .tap = { 20, 20, 10, 8, 2, 1, 1, 1 }, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F", + .klen = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x4b\x1a\x15\xa0\xaf\x08\x6d\x70" + "\xf0\xa7\x97\xb5\x31\x4b\x8c\xc3" + "\x4d\xf2\x7a\x9d\xdd\xd4\x15\x99" + "\x57\xad\xc6\xb1\x35\x69\xf5\x6a" + "\x2d\x70\xe4\x97\x49\xb2\x9f\x71" + "\xde\x22\xb5\x70\x8c\x69\x24\xd3" + "\xad\x80\x58\x48\x90\xe4\xed\xba" + "\x76\x3d\x71\x7c\x57\x25\x87", + .ilen = 63, + .result = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00", + .rlen = 63, + .also_non_np = 1, + .np = 8, + .tap = { 20, 20, 10, 8, 2, 1, 1, 1 }, + } +}; + static struct cipher_testvec aes_cbc_enc_tv_template[] = { { /* From RFC 3602 */ .key = "\x06\xa9\x21\x40\x36\xb8\xa1\x5b" From 58b9edb065b0b75a794f9b6f80e42e992bde72d2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 11 Jan 2017 10:36:41 -0800 Subject: [PATCH 0664/3220] ANDROID: crypto: heh - factor out poly_hash algorithm Factor most of poly_hash() out into its own keyed hash algorithm so that optimized architecture-specific implementations of it will be possible. For now we call poly_hash through the shash API, since HEH already had an example of using shash for another algorithm (CMAC), and we will not be adding any poly_hash implementations that require ahash yet. We can however switch to ahash later if it becomes useful. 
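For illustration, the factored-out algorithm is used through the regular
shash API like any other keyed hash. A rough sketch (error checking
omitted; tau_key, data and len stand for the caller's buffers, and the
16-byte key and digest sizes follow the definitions in this patch):

	struct crypto_shash *tfm = crypto_alloc_shash("poly_hash", 0, 0);
	SHASH_DESC_ON_STACK(desc, tfm);
	u8 digest[16];

	crypto_shash_setkey(tfm, tau_key, 16);
	desc->tfm = tfm;
	desc->flags = 0;
	crypto_shash_digest(desc, data, len, digest);
	crypto_free_shash(tfm);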
Bug: 32508661 Signed-off-by: Eric Biggers Change-Id: I8de54ddcecd1d7fa6e9842a09506a08129bae0b6 --- crypto/heh.c | 330 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 232 insertions(+), 98 deletions(-) diff --git a/crypto/heh.c b/crypto/heh.c index 48a284cecaa2..10c00aaf797e 100644 --- a/crypto/heh.c +++ b/crypto/heh.c @@ -37,13 +37,14 @@ struct heh_instance_ctx { struct crypto_shash_spawn cmac; + struct crypto_shash_spawn poly_hash; struct crypto_skcipher_spawn ecb; }; struct heh_tfm_ctx { struct crypto_shash *cmac; + struct crypto_shash *poly_hash; /* keyed with tau_key */ struct crypto_ablkcipher *ecb; - struct gf128mul_4k *tau_key; }; struct heh_cmac_data { @@ -63,6 +64,10 @@ struct heh_req_ctx { /* aligned to alignmask */ struct shash_desc desc; /* + crypto_shash_descsize(cmac) */ } cmac; + struct { + struct shash_desc desc; + /* + crypto_shash_descsize(poly_hash) */ + } poly_hash; struct { u8 keystream[HEH_BLOCK_SIZE]; u8 tmp[HEH_BLOCK_SIZE]; @@ -157,73 +162,138 @@ static int generate_betas(struct ablkcipher_request *req, return 0; } +/*****************************************************************************/ + /* - * Evaluation of a polynomial over GF(2^128) using Horner's rule. The - * polynomial is evaluated at 'point'. The polynomial's coefficients are taken - * from 'coeffs_sgl' and are for terms with consecutive descending degree ending - * at degree 1. 'bytes_of_coeffs' is 16 times the number of terms. + * This is the generic version of poly_hash. It does the GF(2^128) + * multiplication by 'tau_key' using a precomputed table, without using any + * special CPU instructions. On some platforms, an accelerated version (with + * higher cra_priority) may be used instead. */ -static be128 evaluate_polynomial(struct gf128mul_4k *point, - struct scatterlist *coeffs_sgl, - unsigned int bytes_of_coeffs) + +struct poly_hash_tfm_ctx { + struct gf128mul_4k *tau_key; +}; + +struct poly_hash_desc_ctx { + be128 digest; + unsigned int count; +}; + +static int poly_hash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) { - be128 value = {0}; - struct sg_mapping_iter miter; - unsigned int remaining = bytes_of_coeffs; - unsigned int needed = 0; + struct poly_hash_tfm_ctx *tctx = crypto_shash_ctx(tfm); + be128 key128; - sg_miter_start(&miter, coeffs_sgl, sg_nents(coeffs_sgl), - SG_MITER_FROM_SG | SG_MITER_ATOMIC); - while (remaining) { - be128 coeff; - const u8 *src; - unsigned int srclen; - u8 *dst = (u8 *)&value; + if (keylen != HEH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } - /* - * Note: scatterlist elements are not necessarily evenly - * divisible into blocks, nor are they necessarily aligned to - * __alignof__(be128). 
- */ - sg_miter_next(&miter); + if (tctx->tau_key) + gf128mul_free_4k(tctx->tau_key); + memcpy(&key128, key, HEH_BLOCK_SIZE); + tctx->tau_key = gf128mul_init_4k_ble(&key128); + if (!tctx->tau_key) + return -ENOMEM; + return 0; +} - src = miter.addr; - srclen = min_t(unsigned int, miter.length, remaining); - remaining -= srclen; +static int poly_hash_init(struct shash_desc *desc) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); - if (needed) { - unsigned int n = min(srclen, needed); - u8 *pos = dst + (HEH_BLOCK_SIZE - needed); + ctx->digest = (be128) { 0 }; + ctx->count = 0; + return 0; +} - needed -= n; - srclen -= n; +static int poly_hash_update(struct shash_desc *desc, const u8 *src, + unsigned int len) +{ + struct poly_hash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % HEH_BLOCK_SIZE; + u8 *dst = (u8 *)&ctx->digest + partial; - while (n--) - *pos++ ^= *src++; + ctx->count += len; - if (!needed) - gf128mul_4k_ble(&value, point); - } + /* Finishing at least one block? */ + if (partial + len >= HEH_BLOCK_SIZE) { - while (srclen >= HEH_BLOCK_SIZE) { - memcpy(&coeff, src, HEH_BLOCK_SIZE); - be128_xor(&value, &value, &coeff); - gf128mul_4k_ble(&value, point); - src += HEH_BLOCK_SIZE; - srclen -= HEH_BLOCK_SIZE; - } + if (partial) { + /* Finish the pending block. */ + unsigned int n = HEH_BLOCK_SIZE - partial; - if (srclen) { - needed = HEH_BLOCK_SIZE - srclen; + len -= n; do { *dst++ ^= *src++; - } while (--srclen); + } while (--n); + + gf128mul_4k_ble(&ctx->digest, tctx->tau_key); } + + /* Process zero or more full blocks. */ + while (len >= HEH_BLOCK_SIZE) { + be128 coeff; + + memcpy(&coeff, src, HEH_BLOCK_SIZE); + be128_xor(&ctx->digest, &ctx->digest, &coeff); + src += HEH_BLOCK_SIZE; + len -= HEH_BLOCK_SIZE; + gf128mul_4k_ble(&ctx->digest, tctx->tau_key); + } + dst = (u8 *)&ctx->digest; } - sg_miter_stop(&miter); - return value; + + /* Continue adding the next block to 'digest'. */ + while (len--) + *dst++ ^= *src++; + return 0; } +static int poly_hash_final(struct shash_desc *desc, u8 *out) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + + /* Finish the last block if needed. 
*/ + if (ctx->count % HEH_BLOCK_SIZE) { + struct poly_hash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + + gf128mul_4k_ble(&ctx->digest, tctx->tau_key); + } + + memcpy(out, &ctx->digest, HEH_BLOCK_SIZE); + return 0; +} + +static void poly_hash_exit(struct crypto_tfm *tfm) +{ + struct poly_hash_tfm_ctx *tctx = crypto_tfm_ctx(tfm); + + gf128mul_free_4k(tctx->tau_key); +} + +static struct shash_alg poly_hash_alg = { + .digestsize = HEH_BLOCK_SIZE, + .init = poly_hash_init, + .update = poly_hash_update, + .final = poly_hash_final, + .setkey = poly_hash_setkey, + .descsize = sizeof(struct poly_hash_desc_ctx), + .base = { + .cra_name = "poly_hash", + .cra_driver_name = "poly_hash-generic", + .cra_priority = 100, + .cra_ctxsize = sizeof(struct poly_hash_tfm_ctx), + .cra_exit = poly_hash_exit, + .cra_module = THIS_MODULE, + }, +}; + +/*****************************************************************************/ + /* * Split the message into 16 byte blocks, padding out the last block, and use * the blocks as coefficients in the evaluation of a polynomial over GF(2^128) @@ -242,18 +312,42 @@ static be128 evaluate_polynomial(struct gf128mul_4k *point, * N is the number of full blocks in the message * m_i is the i-th full block in the message for i = 0 to N-1 inclusive * m_N is the partial block of the message zero-padded up to 16 bytes + * + * Note that most of this is now separated out into its own keyed hash + * algorithm, to allow optimized implementations. However, we still handle the + * swapping of the last two coefficients here in the HEH template because this + * simplifies the poly_hash algorithms: they don't have to buffer an extra + * block, don't have to duplicate as much code, and are more similar to GHASH. */ -static be128 poly_hash(struct crypto_ablkcipher *tfm, struct scatterlist *sgl, - unsigned int len) +static int poly_hash(struct ablkcipher_request *req, struct scatterlist *sgl, + be128 *hash) { + struct heh_req_ctx *rctx = heh_req_ctx(req); + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); - unsigned int tail_offset = get_tail_offset(len); - unsigned int tail_len = len - tail_offset; - be128 hash; + struct shash_desc *desc = &rctx->u.poly_hash.desc; + unsigned int tail_offset = get_tail_offset(req->nbytes); + unsigned int tail_len = req->nbytes - tail_offset; be128 tail[2]; + unsigned int i, n; + struct sg_mapping_iter miter; + int err; + + desc->tfm = ctx->poly_hash; + desc->flags = req->base.flags; /* Handle all full blocks except the last */ - hash = evaluate_polynomial(ctx->tau_key, sgl, tail_offset); + err = crypto_shash_init(desc); + sg_miter_start(&miter, sgl, sg_nents(sgl), + SG_MITER_FROM_SG | SG_MITER_ATOMIC); + for (i = 0; i < tail_offset && !err; i += n) { + sg_miter_next(&miter); + n = min_t(unsigned int, miter.length, tail_offset - i); + err = crypto_shash_update(desc, miter.addr, n); + } + sg_miter_stop(&miter); + if (err) + return err; /* Handle the last full block and the partial block */ scatterwalk_map_and_copy(tail, sgl, tail_offset, tail_len, 0); @@ -261,11 +355,15 @@ static be128 poly_hash(struct crypto_ablkcipher *tfm, struct scatterlist *sgl, if (tail_len != HEH_BLOCK_SIZE) { /* handle the partial block */ memset((u8 *)tail + tail_len, 0, sizeof(tail) - tail_len); - be128_xor(&hash, &hash, &tail[1]); - gf128mul_4k_ble(&hash, ctx->tau_key); + err = crypto_shash_update(desc, (u8 *)&tail[1], HEH_BLOCK_SIZE); + if (err) + return err; } - be128_xor(&hash, &hash, &tail[0]); - return hash; + 
err = crypto_shash_final(desc, (u8 *)hash); + if (err) + return err; + be128_xor(hash, hash, &tail[0]); + return 0; } /* @@ -323,13 +421,14 @@ static int heh_tfm_blocks(struct ablkcipher_request *req, static int heh_hash(struct ablkcipher_request *req, const be128 *beta_key) { be128 hash; - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); unsigned int tail_offset = get_tail_offset(req->nbytes); unsigned int partial_len = req->nbytes % HEH_BLOCK_SIZE; int err; /* poly_hash() the full message including the partial block */ - hash = poly_hash(tfm, req->src, req->nbytes); + err = poly_hash(req, req->src, &hash); + if (err) + return err; /* Transform all full blocks except the last */ err = heh_tfm_blocks(req, req->src, req->dst, tail_offset, &hash, @@ -361,10 +460,8 @@ static int heh_hash_inv(struct ablkcipher_request *req, const be128 *beta_key) be128 tmp; struct scatterlist tmp_sgl[2]; struct scatterlist *tail_sgl; - unsigned int len = req->nbytes; - unsigned int tail_offset = get_tail_offset(len); + unsigned int tail_offset = get_tail_offset(req->nbytes); struct scatterlist *sgl = req->dst; - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); int err; /* @@ -388,7 +485,9 @@ static int heh_hash_inv(struct ablkcipher_request *req, const be128 *beta_key) */ memset(&tmp, 0, sizeof(tmp)); scatterwalk_map_and_copy(&tmp, tail_sgl, 0, HEH_BLOCK_SIZE, 1); - tmp = poly_hash(tfm, sgl, len); + err = poly_hash(req, sgl, &tmp); + if (err) + return err; be128_xor(&tmp, &tmp, &hash); scatterwalk_map_and_copy(&tmp, tail_sgl, 0, HEH_BLOCK_SIZE, 1); return 0; @@ -522,8 +621,6 @@ static int heh_ecb(struct ablkcipher_request *req, bool decrypt) static int heh_crypt(struct ablkcipher_request *req, bool decrypt) { - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct heh_tfm_ctx *ctx = crypto_ablkcipher_ctx(tfm); struct heh_req_ctx *rctx = heh_req_ctx(req); int err; @@ -531,9 +628,6 @@ static int heh_crypt(struct ablkcipher_request *req, bool decrypt) if (req->nbytes < HEH_BLOCK_SIZE) return -EINVAL; - /* Key must have been set */ - if (!ctx->tau_key) - return -ENOKEY; err = generate_betas(req, &rctx->beta1_key, &rctx->beta2_key); if (err) return err; @@ -602,11 +696,8 @@ static int heh_setkey(struct crypto_ablkcipher *parent, const u8 *key, memcpy(derived_keys + i, digest, HEH_BLOCK_SIZE); } - if (ctx->tau_key) - gf128mul_free_4k(ctx->tau_key); - err = -ENOMEM; - ctx->tau_key = gf128mul_init_4k_ble((const be128 *)derived_keys); - if (!ctx->tau_key) + err = crypto_shash_setkey(ctx->poly_hash, derived_keys, HEH_BLOCK_SIZE); + if (err) goto out; crypto_ablkcipher_clear_flags(ecb, CRYPTO_TFM_REQ_MASK); @@ -627,6 +718,7 @@ static int heh_init_tfm(struct crypto_tfm *tfm) struct heh_instance_ctx *ictx = crypto_instance_ctx(inst); struct heh_tfm_ctx *ctx = crypto_tfm_ctx(tfm); struct crypto_shash *cmac; + struct crypto_shash *poly_hash; struct crypto_ablkcipher *ecb; unsigned int reqsize; int err; @@ -635,25 +727,37 @@ static int heh_init_tfm(struct crypto_tfm *tfm) if (IS_ERR(cmac)) return PTR_ERR(cmac); + poly_hash = crypto_spawn_shash(&ictx->poly_hash); + err = PTR_ERR(poly_hash); + if (IS_ERR(poly_hash)) + goto err_free_cmac; + ecb = crypto_spawn_skcipher(&ictx->ecb); err = PTR_ERR(ecb); if (IS_ERR(ecb)) - goto err_free_cmac; + goto err_free_poly_hash; ctx->cmac = cmac; + ctx->poly_hash = poly_hash; ctx->ecb = ecb; reqsize = crypto_tfm_alg_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); - reqsize += max(offsetof(struct heh_req_ctx, u.cmac.desc) + - sizeof(struct 
shash_desc) + - crypto_shash_descsize(cmac), - offsetof(struct heh_req_ctx, u.ecb.req) + - sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(ecb)); + reqsize += max3(offsetof(struct heh_req_ctx, u.cmac.desc) + + sizeof(struct shash_desc) + + crypto_shash_descsize(cmac), + offsetof(struct heh_req_ctx, u.poly_hash.desc) + + sizeof(struct shash_desc) + + crypto_shash_descsize(poly_hash), + offsetof(struct heh_req_ctx, u.ecb.req) + + sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(ecb)); tfm->crt_ablkcipher.reqsize = reqsize; + return 0; +err_free_poly_hash: + crypto_free_shash(poly_hash); err_free_cmac: crypto_free_shash(cmac); return err; @@ -663,8 +767,8 @@ static void heh_exit_tfm(struct crypto_tfm *tfm) { struct heh_tfm_ctx *ctx = crypto_tfm_ctx(tfm); - gf128mul_free_4k(ctx->tau_key); crypto_free_shash(ctx->cmac); + crypto_free_shash(ctx->poly_hash); crypto_free_ablkcipher(ctx->ecb); } @@ -673,6 +777,7 @@ static void heh_free_instance(struct crypto_instance *inst) struct heh_instance_ctx *ctx = crypto_instance_ctx(inst); crypto_drop_shash(&ctx->cmac); + crypto_drop_shash(&ctx->poly_hash); crypto_drop_skcipher(&ctx->ecb); kfree(inst); } @@ -689,12 +794,13 @@ static void heh_free_instance(struct crypto_instance *inst) */ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, const char *full_name, const char *cmac_name, - const char *ecb_name) + const char *poly_hash_name, const char *ecb_name) { struct crypto_attr_type *algt; struct crypto_instance *inst; struct heh_instance_ctx *ctx; struct shash_alg *cmac; + struct shash_alg *poly_hash; struct crypto_alg *ecb; int err; @@ -713,10 +819,9 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, ctx = crypto_instance_ctx(inst); - /* Set up the cmac and ecb spawns */ - + /* Set up the cmac spawn */ ctx->cmac.base.inst = inst; - err = crypto_grab_shash(&ctx->cmac, cmac_name, 0, CRYPTO_ALG_ASYNC); + err = crypto_grab_shash(&ctx->cmac, cmac_name, 0, 0); if (err) goto err_free_inst; cmac = crypto_spawn_shash_alg(&ctx->cmac); @@ -724,12 +829,23 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, if (cmac->digestsize != HEH_BLOCK_SIZE) goto err_drop_cmac; + /* Set up the poly_hash spawn */ + ctx->poly_hash.base.inst = inst; + err = crypto_grab_shash(&ctx->poly_hash, poly_hash_name, 0, 0); + if (err) + goto err_drop_cmac; + poly_hash = crypto_spawn_shash_alg(&ctx->poly_hash); + err = -EINVAL; + if (poly_hash->digestsize != HEH_BLOCK_SIZE) + goto err_drop_poly_hash; + + /* Set up the ecb spawn */ ctx->ecb.base.inst = inst; err = crypto_grab_skcipher(&ctx->ecb, ecb_name, 0, crypto_requires_sync(algt->type, algt->mask)); if (err) - goto err_drop_cmac; + goto err_drop_poly_hash; ecb = crypto_skcipher_spawn_alg(&ctx->ecb); /* HEH only supports block ciphers with 16 byte block size */ @@ -750,7 +866,8 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, /* Set the instance names */ err = -ENAMETOOLONG; if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, - "heh_base(%s,%s)", cmac->base.cra_driver_name, + "heh_base(%s,%s,%s)", cmac->base.cra_driver_name, + poly_hash->base.cra_driver_name, ecb->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_drop_ecb; @@ -762,8 +879,7 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, /* Finish initializing the instance */ inst->alg.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | - ((cmac->base.cra_flags | ecb->cra_flags) & - CRYPTO_ALG_ASYNC); + 
(ecb->cra_flags & CRYPTO_ALG_ASYNC); inst->alg.cra_blocksize = HEH_BLOCK_SIZE; inst->alg.cra_ctxsize = sizeof(struct heh_tfm_ctx); inst->alg.cra_alignmask = ecb->cra_alignmask | (__alignof__(be128) - 1); @@ -792,6 +908,8 @@ static int heh_create_common(struct crypto_template *tmpl, struct rtattr **tb, err_drop_ecb: crypto_drop_skcipher(&ctx->ecb); +err_drop_poly_hash: + crypto_drop_shash(&ctx->poly_hash); err_drop_cmac: crypto_drop_shash(&ctx->cmac); err_free_inst: @@ -823,7 +941,8 @@ static int heh_create(struct crypto_template *tmpl, struct rtattr **tb) CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; - return heh_create_common(tmpl, tb, full_name, cmac_name, ecb_name); + return heh_create_common(tmpl, tb, full_name, cmac_name, "poly_hash", + ecb_name); } static struct crypto_template heh_tmpl = { @@ -837,26 +956,34 @@ static int heh_base_create(struct crypto_template *tmpl, struct rtattr **tb) { char full_name[CRYPTO_MAX_ALG_NAME]; const char *cmac_name; + const char *poly_hash_name; const char *ecb_name; cmac_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cmac_name)) return PTR_ERR(cmac_name); - ecb_name = crypto_attr_alg_name(tb[2]); + poly_hash_name = crypto_attr_alg_name(tb[2]); + if (IS_ERR(poly_hash_name)) + return PTR_ERR(poly_hash_name); + + ecb_name = crypto_attr_alg_name(tb[3]); if (IS_ERR(ecb_name)) return PTR_ERR(ecb_name); - if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "heh_base(%s,%s)", - cmac_name, ecb_name) >= CRYPTO_MAX_ALG_NAME) + if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "heh_base(%s,%s,%s)", + cmac_name, poly_hash_name, ecb_name) >= + CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; - return heh_create_common(tmpl, tb, full_name, cmac_name, ecb_name); + return heh_create_common(tmpl, tb, full_name, cmac_name, poly_hash_name, + ecb_name); } /* * If HEH is instantiated as "heh_base" instead of "heh", then specific - * implementations of cmac and ecb can be specified instead of just the cipher + * implementations of cmac, poly_hash, and ecb can be specified instead of just + * the cipher. */ static struct crypto_template heh_base_tmpl = { .name = "heh_base", @@ -877,8 +1004,14 @@ static int __init heh_module_init(void) if (err) goto out_undo_heh; + err = crypto_register_shash(&poly_hash_alg); + if (err) + goto out_undo_heh_base; + return 0; +out_undo_heh_base: + crypto_unregister_template(&heh_base_tmpl); out_undo_heh: crypto_unregister_template(&heh_tmpl); return err; @@ -888,6 +1021,7 @@ static void __exit heh_module_exit(void) { crypto_unregister_template(&heh_tmpl); crypto_unregister_template(&heh_base_tmpl); + crypto_unregister_shash(&poly_hash_alg); } module_init(heh_module_init); From 0223de3a24eff401c8eafb27055ad5fc290f2808 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 18:32:19 -0800 Subject: [PATCH 0665/3220] ANDROID: arm64/crypto: add ARMv8-CE optimized poly_hash algorithm poly_hash is part of the HEH (Hash-Encrypt-Hash) encryption mode, proposed in Internet Draft https://tools.ietf.org/html/draft-cope-heh-01. poly_hash is very similar to GHASH; besides the swapping of the last two coefficients which we opted to handle in the HEH template, poly_hash just uses a different finite field representation. As with GHASH, poly_hash becomes much faster and more secure against timing attacks when implemented using carryless multiplication instructions instead of tables. This patch adds an ARMv8-CE optimized version of poly_hash, based roughly on the existing ARMv8-CE optimized version of GHASH. 
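As background for the multiplication strategy in the assembly (standard
Karatsuba splitting, not something new in this patch): writing the two
128-bit polynomials as a = a1*x^64 + a0 and b = b1*x^64 + b0, the 256-bit
carryless product is formed from three 64x64-bit PMULLs instead of four:

	a*b = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0

where + is XOR. This is why the code precomputes KEY2 = (b1+b0):(b1+b0)
from the hash key.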
Benchmark results are shown below, but note that the resistance to timing attacks may be even more important than the performance gain. poly_hash only: poly_hash-generic: 1,000,000 setkey() takes 1185 ms hashing is 328 MB/s poly_hash-ce: 1,000,000 setkey() takes 8 ms hashing is 1756 MB/s heh(aes) with 4096-byte inputs (this is the ideal case, as the improvement is less significant with smaller inputs): encryption with "heh_base(cmac(aes-ce),poly_hash-generic,ecb-aes-ce)": 118 MB/s decryption with "heh_base(cmac(aes-ce),poly_hash-generic,ecb-aes-ce)": 120 MB/s encryption with "heh_base(cmac(aes-ce),poly_hash-ce,ecb-aes-ce)": 291 MB/s decryption with "heh_base(cmac(aes-ce),poly_hash-ce,ecb-aes-ce)": 293 MB/s Bug: 32508661 Signed-off-by: Eric Biggers Change-Id: I621ec0e1115df7e6f5cbd7e864a4a9d8d2e94cf2 --- arch/arm64/crypto/Kconfig | 5 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/poly-hash-ce-core.S | 163 +++++++++++++++++++++++++ arch/arm64/crypto/poly-hash-ce-glue.c | 166 ++++++++++++++++++++++++++ crypto/Kconfig | 1 + 5 files changed, 338 insertions(+) create mode 100644 arch/arm64/crypto/poly-hash-ce-core.S create mode 100644 arch/arm64/crypto/poly-hash-ce-glue.c diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 2cf32e9887e1..de1aab4b5da8 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_HASH +config CRYPTO_POLY_HASH_ARM64_CE + tristate "poly_hash (for HEH encryption mode) using ARMv8 Crypto Extensions" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_HASH + config CRYPTO_AES_ARM64_CE tristate "AES core cipher using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index abb79b3cfcfe..f0a8f2475ea3 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -17,6 +17,9 @@ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o +obj-$(CONFIG_CRYPTO_POLY_HASH_ARM64_CE) += poly-hash-ce.o +poly-hash-ce-y := poly-hash-ce-glue.o poly-hash-ce-core.o + obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto diff --git a/arch/arm64/crypto/poly-hash-ce-core.S b/arch/arm64/crypto/poly-hash-ce-core.S new file mode 100644 index 000000000000..8ccb544c5526 --- /dev/null +++ b/arch/arm64/crypto/poly-hash-ce-core.S @@ -0,0 +1,163 @@ +/* + * Accelerated poly_hash implementation with ARMv8 PMULL instructions. + * + * Based on ghash-ce-core.S. + * + * Copyright (C) 2014 Linaro Ltd. + * Copyright (C) 2017 Google, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include +#include + + KEY .req v0 + KEY2 .req v1 + T1 .req v2 + T2 .req v3 + GSTAR .req v4 + XL .req v5 + XM .req v6 + XH .req v7 + + .text + .arch armv8-a+crypto + + /* 16-byte aligned (2**4 = 16); not required, but might as well */ + .align 4 +.Lgstar: + .quad 0x87, 0x87 + +/* + * void pmull_poly_hash_update(le128 *digest, const le128 *key, + * const u8 *src, unsigned int blocks, + * unsigned int partial); + */ +ENTRY(pmull_poly_hash_update) + + /* Load digest into XL */ + ld1 {XL.16b}, [x0] + + /* Load key into KEY */ + ld1 {KEY.16b}, [x1] + + /* Load g*(x) = g(x) + x^128 = x^7 + x^2 + x + 1 into both halves of + * GSTAR */ + adr x1, .Lgstar + ld1 {GSTAR.2d}, [x1] + + /* Set KEY2 to (KEY[1]+KEY[0]):(KEY[1]+KEY[0]). This is needed for + * Karatsuba multiplication. */ + ext KEY2.16b, KEY.16b, KEY.16b, #8 + eor KEY2.16b, KEY2.16b, KEY.16b + + /* If 'partial' is nonzero, then we're finishing a pending block and + * should go right to the multiplication. */ + cbnz w4, 1f + +0: + /* Add the next block from 'src' to the digest */ + ld1 {T1.16b}, [x2], #16 + eor XL.16b, XL.16b, T1.16b + sub w3, w3, #1 + +1: + /* + * Multiply the current 128-bit digest (a1:a0, in XL) by the 128-bit key + * (b1:b0, in KEY) using Karatsuba multiplication. + */ + + /* T1 = (a1+a0):(a1+a0) */ + ext T1.16b, XL.16b, XL.16b, #8 + eor T1.16b, T1.16b, XL.16b + + /* XH = a1 * b1 */ + pmull2 XH.1q, XL.2d, KEY.2d + + /* XL = a0 * b0 */ + pmull XL.1q, XL.1d, KEY.1d + + /* XM = (a1+a0) * (b1+b0) */ + pmull XM.1q, T1.1d, KEY2.1d + + /* XM += (XH[0]:XL[1]) + XL + XH */ + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, XL.16b, XH.16b + eor XM.16b, XM.16b, T1.16b + eor XM.16b, XM.16b, T2.16b + + /* + * Now the 256-bit product is in XH[1]:XM:XL[0]. It represents a + * polynomial over GF(2) with degree as large as 255. We need to + * compute its remainder modulo g(x) = x^128+x^7+x^2+x+1. For this it + * is sufficient to compute the remainder of the high half 'c(x)x^128' + * add it to the low half. To reduce the high half we use the Barrett + * reduction method. The basic idea is that we can express the + * remainder p(x) as g(x)q(x) mod x^128, where q(x) = (c(x)x^128)/g(x). + * As detailed in [1], to avoid having to divide by g(x) at runtime the + * following equivalent expression can be derived: + * + * p(x) = [ g*(x)((c(x)q+(x))/x^128) ] mod x^128 + * + * where g*(x) = x^128+g(x) = x^7+x^2+x+1, and q+(x) = x^256/g(x) = g(x) + * in this case. This is also equivalent to: + * + * p(x) = [ g*(x)((c(x)(x^128 + g*(x)))/x^128) ] mod x^128 + * = [ g*(x)(c(x) + (c(x)g*(x))/x^128) ] mod x^128 + * + * Since deg g*(x) < 64: + * + * p(x) = [ g*(x)(c(x) + ((c(x)/x^64)g*(x))/x^64) ] mod x^128 + * = [ g*(x)((c(x)/x^64)x^64 + (c(x) mod x^64) + + * ((c(x)/x^64)g*(x))/x^64) ] mod x^128 + * + * Letting t(x) = g*(x)(c(x)/x^64): + * + * p(x) = [ t(x)x^64 + g*(x)((c(x) mod x^64) + t(x)/x^64) ] mod x^128 + * + * Therefore, to do the reduction we only need to issue two 64-bit => + * 128-bit carryless multiplications: g*(x) times c(x)/x^64, and g*(x) + * times ((c(x) mod x^64) + t(x)/x^64). (Multiplication by x^64 doesn't + * count since it is simply a shift or move.) + * + * An alternate reduction method, also based on Barrett reduction and + * described in [1], uses only shifts and XORs --- no multiplications. + * However, the method with multiplications requires fewer instructions + * and is faster on processors with fast carryless multiplication. 
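+ *
+ * (Tying this to the code: g*(x) = x^7 + x^2 + x + 1 is the constant 0x87
+ * that .Lgstar loads into both halves of GSTAR above, and the two
+ * multiplications by g*(x) described here are the two pmull2 instructions
+ * against GSTAR.2d below.)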
+ * + * [1] "Intel Carry-Less Multiplication Instruction and its Usage for + * Computing the GCM Mode", + * https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf + */ + + /* 256-bit product is XH[1]:XM:XL[0], so c(x) is XH[1]:XM[1] */ + + /* T1 = t(x) = g*(x)(c(x)/x^64) */ + pmull2 T1.1q, GSTAR.2d, XH.2d + + /* T2 = g*(x)((c(x) mod x^64) + t(x)/x^64) */ + eor T2.16b, XM.16b, T1.16b + pmull2 T2.1q, GSTAR.2d, T2.2d + + /* Make XL[0] be the low half of the 128-bit result by adding the low 64 + * bits of the T2 term to what was already there. The 't(x)x^64' term + * makes no difference, so skip it. */ + eor XL.16b, XL.16b, T2.16b + + /* Make XL[1] be the high half of the 128-bit result by adding the high + * 64 bits of the 't(x)x^64' and T2 terms to what was already in XM[0], + * then moving XM[0] to XL[1]. */ + eor XM.16b, XM.16b, T1.16b + ext T2.16b, T2.16b, T2.16b, #8 + eor XM.16b, XM.16b, T2.16b + mov XL.d[1], XM.d[0] + + /* If more blocks remain, then loop back to process the next block; + * else, store the digest and return. */ + cbnz w3, 0b + st1 {XL.16b}, [x0] + ret +ENDPROC(pmull_poly_hash_update) diff --git a/arch/arm64/crypto/poly-hash-ce-glue.c b/arch/arm64/crypto/poly-hash-ce-glue.c new file mode 100644 index 000000000000..e195740c9ecf --- /dev/null +++ b/arch/arm64/crypto/poly-hash-ce-glue.c @@ -0,0 +1,166 @@ +/* + * Accelerated poly_hash implementation with ARMv8 PMULL instructions. + * + * Based on ghash-ce-glue.c. + * + * poly_hash is part of the HEH (Hash-Encrypt-Hash) encryption mode, proposed in + * Internet Draft https://tools.ietf.org/html/draft-cope-heh-01. + * + * poly_hash is very similar to GHASH: both algorithms are keyed hashes which + * interpret their input data as coefficients of a polynomial over GF(2^128), + * then calculate a hash value by evaluating that polynomial at the point given + * by the key, e.g. using Horner's rule. The difference is that poly_hash uses + * the more natural "ble" convention to represent GF(2^128) elements, whereas + * GHASH uses the less natural "lle" convention (see include/crypto/gf128mul.h). + * The ble convention makes it simpler to implement GF(2^128) multiplication. + * + * Copyright (C) 2014 Linaro Ltd. + * Copyright (C) 2017 Google Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Note: in this algorithm we currently use 'le128' to represent GF(2^128) + * elements, even though poly_hash-generic uses 'be128'. Both types are + * actually "wrong" because the elements are actually in 'ble' format, and there + * should be a ble type to represent this --- as well as lle, bbe, and lbe types + * for the other conventions for representing GF(2^128) elements. But + * practically it doesn't matter which type we choose here, so we just use le128 + * since it's arguably more accurate, while poly_hash-generic still has to use + * be128 because the generic GF(2^128) multiplication functions all take be128. 
+ */ + +struct poly_hash_desc_ctx { + le128 digest; + unsigned int count; +}; + +asmlinkage void pmull_poly_hash_update(le128 *digest, const le128 *key, + const u8 *src, unsigned int blocks, + unsigned int partial); + +static int poly_hash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + if (keylen != sizeof(le128)) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(crypto_shash_ctx(tfm), key, sizeof(le128)); + return 0; +} + +static int poly_hash_init(struct shash_desc *desc) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->digest = (le128) { 0 }; + ctx->count = 0; + return 0; +} + +static int poly_hash_update(struct shash_desc *desc, const u8 *src, + unsigned int len) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % sizeof(le128); + u8 *dst = (u8 *)&ctx->digest + partial; + + ctx->count += len; + + /* Finishing at least one block? */ + if (partial + len >= sizeof(le128)) { + const le128 *key = crypto_shash_ctx(desc->tfm); + + if (partial) { + /* Finish the pending block. */ + unsigned int n = sizeof(le128) - partial; + + len -= n; + do { + *dst++ ^= *src++; + } while (--n); + } + + /* + * Do the real work. If 'partial' is nonzero, this starts by + * multiplying 'digest' by 'key'. Then for each additional full + * block it adds the block to 'digest' and multiplies by 'key'. + */ + kernel_neon_begin_partial(8); + pmull_poly_hash_update(&ctx->digest, key, src, + len / sizeof(le128), partial); + kernel_neon_end(); + + src += len - (len % sizeof(le128)); + len %= sizeof(le128); + dst = (u8 *)&ctx->digest; + } + + /* Continue adding the next block to 'digest'. */ + while (len--) + *dst++ ^= *src++; + return 0; +} + +static int poly_hash_final(struct shash_desc *desc, u8 *out) +{ + struct poly_hash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % sizeof(le128); + + /* Finish the last block if needed. 
*/ + if (partial) { + const le128 *key = crypto_shash_ctx(desc->tfm); + + kernel_neon_begin_partial(8); + pmull_poly_hash_update(&ctx->digest, key, NULL, 0, partial); + kernel_neon_end(); + } + + memcpy(out, &ctx->digest, sizeof(le128)); + return 0; +} + +static struct shash_alg poly_hash_alg = { + .digestsize = sizeof(le128), + .init = poly_hash_init, + .update = poly_hash_update, + .final = poly_hash_final, + .setkey = poly_hash_setkey, + .descsize = sizeof(struct poly_hash_desc_ctx), + .base = { + .cra_name = "poly_hash", + .cra_driver_name = "poly_hash-ce", + .cra_priority = 300, + .cra_ctxsize = sizeof(le128), + .cra_module = THIS_MODULE, + }, +}; + +static int __init poly_hash_ce_mod_init(void) +{ + return crypto_register_shash(&poly_hash_alg); +} + +static void __exit poly_hash_ce_mod_exit(void) +{ + crypto_unregister_shash(&poly_hash_alg); +} + +MODULE_DESCRIPTION("Polynomial evaluation hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_LICENSE("GPL v2"); + +module_cpu_feature_match(PMULL, poly_hash_ce_mod_init); +module_exit(poly_hash_ce_mod_exit); diff --git a/crypto/Kconfig b/crypto/Kconfig index 627227f1162d..3240d394426c 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -295,6 +295,7 @@ config CRYPTO_HEH select CRYPTO_ECB select CRYPTO_GF128MUL select CRYPTO_MANAGER + select CRYPTO_POLY_HASH_ARM64_CE if ARM64 && KERNEL_MODE_NEON help HEH: Hash-Encrypt-Hash mode HEH is a proposed block cipher mode of operation which extends the From 3e0dd6ec69beb4748a2fc93b5140da2248693736 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 17:02:39 -0800 Subject: [PATCH 0666/3220] ANDROID: ext4: allow encrypting filenames using HEH algorithm Update ext4 encryption to allow filenames to be encrypted using the Hash-Encrypt-Hash (HEH) block cipher mode of operation, which is believed to be more secure than CBC, particularly within the constant initialization vector (IV) constraint of filename encryption. Notably, HEH avoids the "common prefix" problem of CBC. Both algorithms use AES-256 as the underlying block cipher and take a 256-bit key. We assign mode number 126 to HEH, just below 127 (EXT4_ENCRYPTION_MODE_PRIVATE) which in some kernels is reserved for inline encryption on MSM chipsets. Note that these modes are not yet upstream, which is why these numbers are being used; it's preferable to avoid collisions with modes that may be added upstream. Also, although HEH is not hardware-specific, we aren't currently reserving mode number 5 for HEH upstream, since for now we are tying HEH to the new key derivation method which might become an independent flag upstream, and there's also a chance that details of HEH will change after it gets wider review. 
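Editor's note: for concreteness, this is how a userspace policy tool would select the new mode. A hedged sketch only: the policy struct layout and ioctl number are assumptions about the pre-fscrypt ext4 ABI of this kernel generation and should be checked against the target tree's ext4.h; the mode value 126 is the one assigned in the diff below.

#include <string.h>
#include <sys/ioctl.h>

/* Assumed pre-fscrypt ext4 policy ABI of this era */
struct ext4_encryption_policy {
    char version;
    char contents_encryption_mode;
    char filenames_encryption_mode;
    char flags;
    char master_key_descriptor[8];
} __attribute__((__packed__));

#define EXT4_IOC_SET_ENCRYPTION_POLICY \
    _IOR('f', 19, struct ext4_encryption_policy)

/* Ask for HEH filename encryption (mode 126) on an empty directory */
static int set_heh_policy(int dirfd, const char key_desc[8])
{
    struct ext4_encryption_policy p = {
        .version = 0,
        .contents_encryption_mode = 1,    /* AES-256-XTS */
        .filenames_encryption_mode = 126, /* AES-256-HEH, this patch */
        .flags = 0,
    };

    memcpy(p.master_key_descriptor, key_desc, 8);
    return ioctl(dirfd, EXT4_IOC_SET_ENCRYPTION_POLICY, &p);
}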
Bug: 32975945 Signed-off-by: Eric Biggers Change-Id: I81418709d47da0e0ac607ae3f91088063c2d5dd4 --- fs/ext4/Kconfig | 1 + fs/ext4/crypto_fname.c | 3 ++- fs/ext4/crypto_key.c | 3 +++ fs/ext4/ext4.h | 1 + fs/ext4/ext4_crypto.h | 3 +++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index b46e9fc64196..3c8293215603 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -106,6 +106,7 @@ config EXT4_ENCRYPTION select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS + select CRYPTO_HEH select CRYPTO_CTR select CRYPTO_SHA256 select KEYS diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 2fbef8a14760..e2645ca9b95e 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -44,7 +44,8 @@ static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res) bool ext4_valid_filenames_enc_mode(uint32_t mode) { - return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); + return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS || + mode == EXT4_ENCRYPTION_MODE_AES_256_HEH); } static unsigned max_name_len(struct inode *inode) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c5882b36e558..3600dbf4e971 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -182,6 +182,9 @@ retry: case EXT4_ENCRYPTION_MODE_AES_256_CTS: cipher_str = "cts(cbc(aes))"; break; + case EXT4_ENCRYPTION_MODE_AES_256_HEH: + cipher_str = "heh(aes)"; + break; default: printk_once(KERN_WARNING "ext4: unsupported key mode %d (ino %u)\n", diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5cf6d8be48dd..bd135cfe7927 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -588,6 +588,7 @@ enum { #define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 #define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 #define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 +#define EXT4_ENCRYPTION_MODE_AES_256_HEH 126 #include "ext4_crypto.h" diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index ac7d4e813796..41080095b1b7 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -60,6 +60,7 @@ struct ext4_encryption_context { #define EXT4_AES_256_GCM_KEY_SIZE 32 #define EXT4_AES_256_CBC_KEY_SIZE 32 #define EXT4_AES_256_CTS_KEY_SIZE 32 +#define EXT4_AES_256_HEH_KEY_SIZE 32 #define EXT4_AES_256_XTS_KEY_SIZE 64 #define EXT4_MAX_KEY_SIZE 64 @@ -121,6 +122,8 @@ static inline int ext4_encryption_key_size(int mode) return EXT4_AES_256_CBC_KEY_SIZE; case EXT4_ENCRYPTION_MODE_AES_256_CTS: return EXT4_AES_256_CTS_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_HEH: + return EXT4_AES_256_HEH_KEY_SIZE; default: BUG(); } From a425a70b2627c5d429c875619d9d42a39bf0f7dd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jan 2017 17:02:40 -0800 Subject: [PATCH 0667/3220] ANDROID: ext4: add a non-reversible key derivation method Add a new per-file key derivation method to ext4 encryption defined as: derived_key[0:127] = AES-256-ENCRYPT(master_key[0:255], nonce) derived_key[128:255] = AES-256-ENCRYPT(master_key[0:255], nonce ^ 0x01) derived_key[256:383] = AES-256-ENCRYPT(master_key[256:511], nonce) derived_key[384:511] = AES-256-ENCRYPT(master_key[256:511], nonce ^ 0x01) ... where the derived key and master key are both 512 bits, the nonce is 128 bits, AES-256-ENCRYPT takes the arguments (key, plaintext), and 'nonce ^ 0x01' denotes flipping the low order bit of the last byte. The existing key derivation method is 'derived_key = AES-128-ECB-ENCRYPT(key=nonce, plaintext=master_key)'. 
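Editor's note: spelled out as code, the v2 derivation is just four single-block encryptions. A minimal sketch of the formulas above, assuming a hypothetical one-block primitive aes256_encrypt_block(key, in, out); the actual kernel implementation, built on crypto_cipher_encrypt_one(), appears in the diff further below.

#include <stdint.h>
#include <string.h>

/* Hypothetical primitive: out = AES-256-ENCRYPT(key, in), one 16-byte block */
void aes256_encrypt_block(const uint8_t key[32], const uint8_t in[16],
                          uint8_t out[16]);

static void derive_key_v2(const uint8_t nonce[16], const uint8_t master[64],
                          uint8_t derived[64])
{
    uint8_t nonce1[16];

    memcpy(nonce1, nonce, 16);
    nonce1[15] ^= 0x01;                  /* flip low bit of the last byte */

    aes256_encrypt_block(master,      nonce,  derived);      /* [0:127]   */
    aes256_encrypt_block(master,      nonce1, derived + 16); /* [128:255] */
    aes256_encrypt_block(master + 32, nonce,  derived + 32); /* [256:383] */
    aes256_encrypt_block(master + 32, nonce1, derived + 48); /* [384:511] */
}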
We want to make this change because currently, given a derived key you can easily compute the master key by computing 'AES-128-ECB-DECRYPT(key=nonce, ciphertext=derived_key)'. This was formerly OK because the previous threat model assumed that the master key and derived keys are equally hard to obtain by an attacker. However, we are looking to move the master key into secure hardware in some cases, so we want to make sure that an attacker with access to a derived key cannot compute the master key. We are doing this instead of increasing the nonce to 512 bits because it's important that the per-file xattr fit in the inode itself. By default, inodes are 256 bytes, and on Android we're already pretty close to that limit. If we increase the nonce size, we end up allocating a new filesystem block for each and every encrypted file, which has a substantial performance and disk utilization impact. Another option considered was to use the HMAC-SHA512 of the nonce, keyed by the master key. However this would be a little less performant, would be less extensible to other key sizes and MAC algorithms, and would pull in a dependency (security-wise and code-wise) on SHA-512. Due to the use of "aes" rather than "ecb(aes)" in the implementation, the new key derivation method is actually about twice as fast as the old one, though the old one could be optimized similarly as well. This patch makes the new key derivation method be used whenever HEH is used to encrypt filenames. Although these two features are logically independent, it was decided to bundle them together for now. Note that neither feature is upstream yet, and it cannot be guaranteed that the on-disk format won't change if/when these features are upstreamed. For this reason, and as noted in the previous patch, the features are both behind a special mode number for now. Signed-off-by: Eric Biggers Change-Id: Iee4113f57e59dc8c0b7dc5238d7003c83defb986 --- fs/ext4/crypto_key.c | 98 +++++++++++++++++++++++++++++++++++++++---- fs/ext4/ext4_crypto.h | 1 + 2 files changed, 92 insertions(+), 7 deletions(-) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 3600dbf4e971..776c51beff35 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -29,16 +29,16 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) } /** - * ext4_derive_key_aes() - Derive a key using AES-128-ECB + * ext4_derive_key_v1() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. * @derived_key: Derived key. * - * Return: Zero on success; non-zero otherwise. 
+ * Return: 0 on success, -errno on failure */ -static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], - char source_key[EXT4_AES_256_XTS_KEY_SIZE], - char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) +static int ext4_derive_key_v1(const char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], + const char source_key[EXT4_AES_256_XTS_KEY_SIZE], + char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) { int res = 0; struct ablkcipher_request *req = NULL; @@ -83,6 +83,91 @@ out: return res; } +/** + * ext4_derive_key_v2() - Derive a key non-reversibly + * @nonce: the nonce associated with the file + * @master_key: the master key referenced by the file + * @derived_key: (output) the resulting derived key + * + * This function computes the following: + * derived_key[0:127] = AES-256-ENCRYPT(master_key[0:255], nonce) + * derived_key[128:255] = AES-256-ENCRYPT(master_key[0:255], nonce ^ 0x01) + * derived_key[256:383] = AES-256-ENCRYPT(master_key[256:511], nonce) + * derived_key[384:511] = AES-256-ENCRYPT(master_key[256:511], nonce ^ 0x01) + * + * 'nonce ^ 0x01' denotes flipping the low order bit of the last byte. + * + * Unlike the v1 algorithm, the v2 algorithm is "non-reversible", meaning that + * compromising a derived key does not also compromise the master key. + * + * Return: 0 on success, -errno on failure + */ +static int ext4_derive_key_v2(const char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE], + const char master_key[EXT4_MAX_KEY_SIZE], + char derived_key[EXT4_MAX_KEY_SIZE]) +{ + const int noncelen = EXT4_KEY_DERIVATION_NONCE_SIZE; + struct crypto_cipher *tfm; + int err; + int i; + + /* + * Since we only use each transform for a small number of encryptions, + * requesting just "aes" turns out to be significantly faster than + * "ecb(aes)", by about a factor of two. + */ + tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + BUILD_BUG_ON(4 * EXT4_KEY_DERIVATION_NONCE_SIZE != EXT4_MAX_KEY_SIZE); + BUILD_BUG_ON(2 * EXT4_AES_256_ECB_KEY_SIZE != EXT4_MAX_KEY_SIZE); + for (i = 0; i < 2; i++) { + memcpy(derived_key, nonce, noncelen); + memcpy(derived_key + noncelen, nonce, noncelen); + derived_key[2 * noncelen - 1] ^= 0x01; + err = crypto_cipher_setkey(tfm, master_key, + EXT4_AES_256_ECB_KEY_SIZE); + if (err) + break; + crypto_cipher_encrypt_one(tfm, derived_key, derived_key); + crypto_cipher_encrypt_one(tfm, derived_key + noncelen, + derived_key + noncelen); + master_key += EXT4_AES_256_ECB_KEY_SIZE; + derived_key += 2 * noncelen; + } + crypto_free_cipher(tfm); + return err; +} + +/** + * ext4_derive_key() - Derive a per-file key from a nonce and master key + * @ctx: the encryption context associated with the file + * @master_key: the master key referenced by the file + * @derived_key: (output) the resulting derived key + * + * Return: 0 on success, -errno on failure + */ +static int ext4_derive_key(const struct ext4_encryption_context *ctx, + const char master_key[EXT4_MAX_KEY_SIZE], + char derived_key[EXT4_MAX_KEY_SIZE]) +{ + BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); + BUILD_BUG_ON(EXT4_AES_256_XTS_KEY_SIZE != EXT4_MAX_KEY_SIZE); + + /* + * Although the key derivation algorithm is logically independent of the + * choice of encryption modes, in this kernel it is bundled with HEH + * encryption of filenames, which is another crypto improvement that + * requires an on-disk format change and requires userspace to specify + * different encryption policies. 
+ */ + if (ctx->filenames_encryption_mode == EXT4_ENCRYPTION_MODE_AES_256_HEH) + return ext4_derive_key_v2(ctx->nonce, master_key, derived_key); + else + return ext4_derive_key_v1(ctx->nonce, master_key, derived_key); +} + void ext4_free_crypt_info(struct ext4_crypt_info *ci) { if (!ci) @@ -231,8 +316,7 @@ retry: res = -ENOKEY; goto out; } - res = ext4_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); + res = ext4_derive_key(&ctx, master_key->raw, raw_key); if (res) goto out; got_key: diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 41080095b1b7..5a1684bd083b 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -58,6 +58,7 @@ struct ext4_encryption_context { #define EXT4_XTS_TWEAK_SIZE 16 #define EXT4_AES_128_ECB_KEY_SIZE 16 #define EXT4_AES_256_GCM_KEY_SIZE 32 +#define EXT4_AES_256_ECB_KEY_SIZE 32 #define EXT4_AES_256_CBC_KEY_SIZE 32 #define EXT4_AES_256_CTS_KEY_SIZE 32 #define EXT4_AES_256_HEH_KEY_SIZE 32 From 8b75db9857a49502a9b7156433f1bc57451805c1 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Mon, 13 Feb 2017 09:22:36 -0800 Subject: [PATCH 0668/3220] ANDROID: ext4 crypto: Disables zeroing on truncation when there's no key When performing orphan cleanup on mount, ext4 may truncate pages. Truncation as currently implemented may require the encryption key for partial zeroing, and the key isn't necessarily available on mount. Since the userspace tools don't perform the partial zeroing operation anyway, let's just skip doing that in the kernel. This patch fixes a BUG_ON() oops. Bug: 35209576 Change-Id: I2527a3f8d2c57d2de5df03fda69ee397f76095d7 Signed-off-by: Michael Halcrow --- fs/ext4/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 07037262337a..e2cb00640d1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3557,6 +3557,11 @@ static int ext4_block_truncate_page(handle_t *handle, unsigned blocksize; struct inode *inode = mapping->host; + /* If we are processing an encrypted inode during orphan list + * handling */ + if (ext4_encrypted_inode(inode) && !ext4_has_encryption_key(inode)) + return 0; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); From 56026a89e632af0cf45602dee1b2881bf21c4eba Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 14 Feb 2017 20:47:17 -0800 Subject: [PATCH 0669/3220] ANDROID: sdcardfs: Fix incorrect hash This adds back the hash calculation removed as part of the previous patch, as it is in fact necessary. 
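Editor's note on why the hash is necessary: the qstr hash selects the bucket in the dentry hashtable, so a hand-assembled qstr whose hash field is left zero sends d_lookup() to the wrong bucket, the existing dentry is missed, and d_alloc() then creates a duplicate. The pattern the diff below restores, as a sketch:

struct qstr dname;

dname.name = name->name;
dname.len  = name->len;
/* must be the same hash function the dcache used at insert time */
dname.hash = full_name_hash(dname.name, dname.len);

lower_dentry = d_lookup(lower_dir_dentry, &dname);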
Signed-off-by: Daniel Rosenberg Bug: 35307857 Change-Id: Ie607332bcf2c5d2efdf924e4060ef3f576bf25dc --- fs/sdcardfs/lookup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 9135866b7766..6b595e892316 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -221,6 +221,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, struct dentry *lower_dentry; const struct qstr *name; struct path lower_path; + struct qstr dname; struct sdcardfs_sb_info *sbi; sbi = SDCARDFS_SB(dentry->d_sb); @@ -306,11 +307,14 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, goto out; /* instatiate a new negative dentry */ - lower_dentry = d_lookup(lower_dir_dentry, name); + dname.name = name->name; + dname.len = name->len; + dname.hash = full_name_hash(dname.name, dname.len); + lower_dentry = d_lookup(lower_dir_dentry, &dname); if (lower_dentry) goto setup_lower; - lower_dentry = d_alloc(lower_dir_dentry, name); + lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { err = -ENOMEM; goto out; From 1914b2934b3eb3def0105f5be0b1b98c4169f9ec Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Feb 2017 17:55:22 -0800 Subject: [PATCH 0670/3220] ANDROID: sdcardfs: Add missing path_put "ANDROID: sdcardfs: Add GID Derivation to sdcardfs" introduced an unbalanced path_get, leading to storage space not being freed after deleting a file until rebooting. This adds the missing path_put. Signed-off-by: Daniel Rosenberg Bug: 34691169 Change-Id: Ia7ef97ec2eca2c555cc06b235715635afc87940e --- fs/sdcardfs/derived_perm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 0bb442338a85..ca239a942065 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -243,6 +243,7 @@ retry_deleg: if (error) pr_err("sdcardfs: Failed to touch up lower fs gid/uid.\n"); } + sdcardfs_put_lower_path(dentry, &path); } static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) { From 880c68e41dd2eab7bdbc64c8061e5b7e0e269cd3 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 22 Feb 2017 14:41:58 -0800 Subject: [PATCH 0671/3220] ANDROID: sdcardfs: Don't bother deleting freelist There is no point deleting entries from dlist, as that is a temporary list on the stack which contains only entries that are being deleted. Not all code paths set up dlist, so those that don't were performing invalid accesses in hash_del_rcu. As an additional means to prevent any other issue, we null out the list entries when we allocate from the cache.
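Editor's note on why initializing the nodes closes the hole: hash_del_rcu() is a thin wrapper around hlist_del_init_rcu(), which starts with an unhashed test (quoted from rculist.h of this era):

static inline void hlist_del_init_rcu(struct hlist_node *n)
{
    if (!hlist_unhashed(n)) {    /* i.e. n->pprev != NULL */
        __hlist_del(n);
        n->pprev = NULL;
    }
}

Memory fresh from kmem_cache_alloc() is not zeroed, so a garbage pprev makes the test pass and sends __hlist_del() through slab junk; after the INIT_HLIST_NODE() calls added in the diff below, the same call degenerates to a harmless no-op.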
Signed-off-by: Daniel Rosenberg Bug: 35666680 Change-Id: Ibb1e28c08c3a600c29418d39ba1c0f3db3bf31e5 --- fs/sdcardfs/packagelist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index d96fcde041cc..56d643f4a9ee 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -178,6 +178,8 @@ static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, GFP_KERNEL); if (!ret) return NULL; + INIT_HLIST_NODE(&ret->dlist); + INIT_HLIST_NODE(&ret->hlist); if (!qstr_copy(key, &ret->key)) { kmem_cache_free(hashtable_entry_cachep, ret); @@ -326,7 +328,6 @@ static int insert_userid_exclude_entry(const struct qstr *key, userid_t value) static void free_hashtable_entry(struct hashtable_entry *entry) { kfree(entry->key.name); - hash_del_rcu(&entry->dlist); kmem_cache_free(hashtable_entry_cachep, entry); } From 166f168a48d69ae0715983324bc80df8ff2aa4a4 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 24 Feb 2017 15:41:48 -0800 Subject: [PATCH 0672/3220] ANDROID: sdcardfs: implement vm_ops->page_mkwrite This comes from the wrapfs patch 3dfec0ffe5e2 Wrapfs: implement vm_ops->page_mkwrite Some file systems (e.g., ext4) require it. Reported by Ted Ts'o. Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 34133558 Change-Id: I1a389b2422c654a6d3046bb8ec3e20511aebfa8e --- fs/sdcardfs/mmap.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c index e21f64675a80..7dd715875ef1 100644 --- a/fs/sdcardfs/mmap.c +++ b/fs/sdcardfs/mmap.c @@ -48,6 +48,39 @@ static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return err; } +static int sdcardfs_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + int err = 0; + struct file *file, *lower_file; + const struct vm_operations_struct *lower_vm_ops; + struct vm_area_struct lower_vma; + + memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); + file = lower_vma.vm_file; + lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops; + BUG_ON(!lower_vm_ops); + if (!lower_vm_ops->page_mkwrite) + goto out; + + lower_file = sdcardfs_lower_file(file); + /* + * XXX: vm_ops->page_mkwrite may be called in parallel. + * Because we have to resort to temporarily changing the + * vma->vm_file to point to the lower file, a concurrent + * invocation of sdcardfs_page_mkwrite could see a different + * value. In this workaround, we keep a different copy of the + * vma structure in our stack, so we never expose a different + * value of the vma->vm_file called to us, even temporarily. + * A better fix would be to change the calling semantics of + * ->page_mkwrite to take an explicit file pointer. 
+ */ + lower_vma.vm_file = lower_file; + err = lower_vm_ops->page_mkwrite(&lower_vma, vmf); +out: + return err; +} + static ssize_t sdcardfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { @@ -78,4 +111,5 @@ const struct address_space_operations sdcardfs_aops = { const struct vm_operations_struct sdcardfs_vm_ops = { .fault = sdcardfs_fault, + .page_mkwrite = sdcardfs_page_mkwrite, }; From 98c307e68c2887263a0c4321b54c402abf5ef7fc Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 24 Feb 2017 15:49:45 -0800 Subject: [PATCH 0673/3220] ANDROID: sdcardfs: support direct-IO (DIO) operations This comes from the wrapfs patch 2e346c83b26e Wrapfs: support direct-IO (DIO) operations Signed-off-by: Li Mengyang Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 34133558 Change-Id: I3fd779c510ab70d56b1d918f99c20421b524cdc4 --- fs/sdcardfs/mmap.c | 21 ++++----------------- fs/sdcardfs/sdcardfs.h | 1 + 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c index 7dd715875ef1..0d4089c62c3a 100644 --- a/fs/sdcardfs/mmap.c +++ b/fs/sdcardfs/mmap.c @@ -85,27 +85,14 @@ static ssize_t sdcardfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { /* - * This function returns zero on purpose in order to support direct IO. - * __dentry_open checks a_ops->direct_IO and returns EINVAL if it is null. - * - * However, this function won't be called by certain file operations - * including generic fs functions. * reads and writes are delivered to - * the lower file systems and the direct IOs will be handled by them. - * - * NOTE: exceptionally, on the recent kernels (since Linux 3.8.x), - * swap_writepage invokes this function directly. + * This function should never be called directly. We need it + * to exist, to get past a check in open_check_o_direct(), + * which is called from do_last(). */ - printk(KERN_INFO "%s, operation is not supported\n", __func__); - return 0; + return -EINVAL; } -/* - * XXX: the default address_space_ops for sdcardfs is empty. We cannot set - * our inode->i_mapping->a_ops to NULL because too many code paths expect - * the a_ops vector to be non-NULL. - */ const struct address_space_operations sdcardfs_aops = { - /* empty on purpose */ .direct_IO = sdcardfs_direct_IO, }; diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index f3cced313108..042f989f0bea 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include From 6696986a932a28b17858279c5eb69f83d566d4de Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 17 Jun 2013 18:36:56 +0100 Subject: [PATCH 0674/3220] cpufreq: interactive governor drops bits in time calculation Keep time calculation in 64-bit throughout. If we have long times between idle calculations this can result in deltas > 32 bits which causes incorrect load percentage calculations and selecting the wrong frequencies if we truncate here. 
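Editor's note: the failure mode is easy to demonstrate in isolation. A standalone sketch (the units assume the microsecond-granularity counters this governor samples, which is an assumption, not shown above):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* two idle samples taken two hours apart, in microseconds */
    uint64_t then = 1000000ULL;
    uint64_t now  = then + 7200ULL * 1000000ULL;

    unsigned int bad  = (unsigned int)(now - then); /* high bits dropped */
    uint64_t     good = now - then;

    /* prints: bad=2905032704 good=7200000000; anything past 2^32
     * units (about 71.6 minutes at 1 us per unit) silently wraps */
    printf("bad=%u good=%llu\n", bad, (unsigned long long)good);
    return 0;
}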
Signed-off-by: Chris Redpath --- drivers/cpufreq/cpufreq_interactive.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index f2929e628820..889c9b8b2237 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -312,13 +312,13 @@ static u64 update_load(int cpu) pcpu->policy->governor_data; u64 now; u64 now_idle; - unsigned int delta_idle; - unsigned int delta_time; + u64 delta_idle; + u64 delta_time; u64 active_time; now_idle = get_cpu_idle_time(cpu, &now, tunables->io_is_busy); - delta_idle = (unsigned int)(now_idle - pcpu->time_in_idle); - delta_time = (unsigned int)(now - pcpu->time_in_idle_timestamp); + delta_idle = (now_idle - pcpu->time_in_idle); + delta_time = (now - pcpu->time_in_idle_timestamp); if (delta_time <= delta_idle) active_time = 0; From 6e1c2455aa286eec7579615883fb5ea7ecac8853 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Thu, 17 Nov 2016 02:32:40 -0500 Subject: [PATCH 0675/3220] ANDROID: usb: gadget: function: Fix commenting style Fix checkpatch.pl warning: Block comments use * on subsequent lines Change-Id: I9c92f128fdb3aeeb6ab9c7039e11f857bebb9539 Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 9d3ec0e37475..f2fa0c271d70 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -676,9 +676,10 @@ static ssize_t acc_write(struct file *fp, const char __user *buf, req->zero = 0; } else { xfer = count; - /* If the data length is a multple of the + /* + * If the data length is a multple of the * maxpacket size then send a zero length packet(ZLP). - */ + */ req->zero = ((xfer % dev->ep_in->maxpacket) == 0); } if (copy_from_user(req->buf, buf, xfer)) { @@ -820,11 +821,11 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, unsigned long flags; /* - printk(KERN_INFO "acc_ctrlrequest " - "%02x.%02x v%04x i%04x l%u\n", - b_requestType, b_request, - w_value, w_index, w_length); -*/ + * printk(KERN_INFO "acc_ctrlrequest " + * "%02x.%02x v%04x i%04x l%u\n", + * b_requestType, b_request, + * w_value, w_index, w_length); + */ if (b_requestType == (USB_DIR_OUT | USB_TYPE_VENDOR)) { if (b_request == ACCESSORY_START) { From f04805218728192d346291c1a0e78cb184142bad Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Mon, 14 Nov 2016 09:48:02 -0800 Subject: [PATCH 0676/3220] ANDROID: dm: android-verity: fix table_make_digest() error handling If table_make_digest() fails, verify_verity_signature() would try to pass the returned ERR_PTR() to kfree(). 
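Editor's note on the underlying rule: kfree(NULL) is defined to be a no-op, but an ERR_PTR() cookie is a small negative address that kfree() will happily try to interpret, so error paths that funnel into a shared kfree() must normalize the pointer first. A generic sketch of the pattern the fix below applies (make_object() and consume() are hypothetical):

static int use_object(int arg)
{
    struct obj *p;
    int ret = 0;

    p = make_object(arg);      /* returns a pointer or ERR_PTR(-errno) */
    if (IS_ERR(p)) {
        ret = PTR_ERR(p);      /* keep the real error code */
        p = NULL;              /* shared kfree(p) below becomes a no-op */
        goto err;
    }
    ret = consume(p);
err:
    kfree(p);                  /* safe for NULL, fatal for ERR_PTR() */
    return ret;
}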
This fixes the smatch error: drivers/md/dm-android-verity.c:601 verify_verity_signature() error: 'pks' dereferencing possible ERR_PTR() Change-Id: I9b9b7764b538cb4a5f94337660e9b0f149b139be Signed-off-by: Greg Hackmann --- drivers/md/dm-android-verity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index bb6c1285e499..ec0a4d19ca3e 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -585,6 +585,8 @@ static int verify_verity_signature(char *key_id, if (IS_ERR(pks)) { DMERR("hashing failed"); + retval = PTR_ERR(pks); + pks = NULL; goto error; } From 16f66bfe88211599834fa7109117553835ab4690 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 10 Nov 2016 19:36:15 -0700 Subject: [PATCH 0677/3220] nf: IDLETIMER: Fix use after free condition during work schedule_work(&timer->work) appears to be called after cancel_work_sync(&info->timer->work) is completed. Work can be scheduled from the PM_POST_SUSPEND notification event even after cancel_work_sync is called. Call stack -004|notify_netlink_uevent( | [X19] timer = 0xFFFFFFC0A5DFC780 -> ( | ... | [NSD:0xFFFFFFC0A5DFC800] kobj = 0x6B6B6B6B6B6B6B6B, | [NSD:0xFFFFFFC0A5DFC868] timeout = 0x6B6B6B6B, | [NSD:0xFFFFFFC0A5DFC86C] refcnt = 0x6B6B6B6B, | [NSD:0xFFFFFFC0A5DFC870] work_pending = 0x6B, | [NSD:0xFFFFFFC0A5DFC871] send_nl_msg = 0x6B, | [NSD:0xFFFFFFC0A5DFC872] active = 0x6B, | [NSD:0xFFFFFFC0A5DFC874] uid = 0x6B6B6B6B, | [NSD:0xFFFFFFC0A5DFC878] suspend_time_valid = 0x6B)) -005|idletimer_tg_work( -006|__read_once_size(inline) -006|static_key_count(inline) -006|static_key_false(inline) -006|trace_workqueue_execute_end(inline) -006|process_one_work( -007|worker_thread( -008|kthread( -009|ret_from_fork(asm) ---|end of frame Force any pending idletimer_tg_work() to complete before freeing the associated work struct and after unregistering to the pm_notifier callback. Change-Id: I4c5f0a1c142f7d698c092cf7bcafdb0f9fbaa9c1 Signed-off-by: Subash Abhinov Kasiviswanathan --- net/netfilter/xt_IDLETIMER.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 0975c993a94e..ada5a304e61e 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -456,6 +456,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) del_timer_sync(&info->timer->timer); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); unregister_pm_notifier(&info->timer->pm_nb); + cancel_work_sync(&info->timer->work); kfree(info->timer->attr.attr.name); kfree(info->timer); } else { From 33af4c0c5faa2ae101db8701d1000512a27dfdb7 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 2 Nov 2016 11:56:40 -0600 Subject: [PATCH 0678/3220] nf: IDLETIMER: Use fullsock when querying uid sock_i_uid() acquires the sk_callback_lock which does not exist for sockets in TCP_NEW_SYN_RECV state. This results in errors showing up as spinlock bad magic. Fix this by looking for the full sock as suggested by Eric. 
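Editor's sketch of the shape of the fix (the call stack quoted below shows the offending path): during the handshake, skb->sk can be a miniature request sock in TCP_NEW_SYN_RECV state, which has no sk_callback_lock for sock_i_uid() to take; skb_to_full_sk() maps such an skb back to the full listener sock, which is safe to query. record_waker_uid() is a hypothetical wrapper around the hunk in the diff below:

static void record_waker_uid(struct idletimer_tg *timer,
                             const struct sk_buff *skb)
{
    /* skb_to_full_sk() returns the listener for NEW_SYN_RECV skbs
     * and the sock itself otherwise */
    struct sock *sk = skb ? skb_to_full_sk(skb) : NULL;

    if (sk)
        timer->uid = from_kuid_munged(current_user_ns(),
                                      sock_i_uid(sk));
}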
Callstack for reference - -003|rwlock_bug -004|arch_read_lock -004|do_raw_read_lock -005|raw_read_lock_bh -006|sock_i_uid -007|from_kuid_munged(inline) -007|reset_timer -008|idletimer_tg_target -009|ipt_do_table -010|iptable_mangle_hook -011|nf_iterate -012|nf_hook_slow -013|NF_HOOK_COND(inline) -013|ip_output -014|ip_local_out -015|ip_build_and_send_pkt -016|tcp_v4_send_synack -017|atomic_sub_return(inline) -017|reqsk_put(inline) -017|tcp_conn_request -018|tcp_v4_conn_request -019|tcp_rcv_state_process -020|tcp_v4_do_rcv -021|tcp_v4_rcv -022|ip_local_deliver_finish -023|NF_HOOK_THRESH(inline) -023|NF_HOOK(inline) -023|ip_local_deliver -024|ip_rcv_finish -025|NF_HOOK_THRESH(inline) -025|NF_HOOK(inline) -025|ip_rcv -026|deliver_skb(inline) -026|deliver_ptype_list_skb(inline) -026|__netif_receive_skb_core -027|__netif_receive_skb -028|netif_receive_skb_internal -029|netif_receive_skb Change-Id: Ic8f3a3d2d7af31434d1163b03971994e2125d552 Signed-off-by: Subash Abhinov Kasiviswanathan Cc: Eric Dumazet --- net/netfilter/xt_IDLETIMER.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index ada5a304e61e..f11aa28b96ce 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -49,6 +49,7 @@ #include #include #include +#include struct idletimer_tg_attr { struct attribute attr; @@ -355,7 +356,7 @@ static void reset_timer(const struct idletimer_tg_info *info, /* Stores the uid resposible for waking up the radio */ if (skb && (skb->sk)) { timer->uid = from_kuid_munged(current_user_ns(), - sock_i_uid(skb->sk)); + sock_i_uid(skb_to_full_sk(skb))); } /* checks if there is a pending inactive notification*/ From 359795138dc5440e09c58025e28ec1b38d648c09 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 7 Mar 2017 15:51:18 +0100 Subject: [PATCH 0679/3220] binder: use group leader instead of open thread The binder allocator assumes that the thread that called binder_open will never die for the lifetime of that proc. That thread is normally the group_leader, however it may not be. Use the group_leader instead of current. Bug: 35707103 Test: Created test case to open with temporary thread Change-Id: Id693f74b3591f3524a8c6e9508e70f3e5a80c588 Signed-off-by: Todd Kjos Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e6af2e819980..08cde76875d2 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3359,7 +3359,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) const char *failure_string; struct binder_buffer *buffer; - if (proc->tsk != current) + if (proc->tsk != current->group_leader) return -EINVAL; if ((vma->vm_end - vma->vm_start) > SZ_4M) @@ -3461,8 +3461,8 @@ static int binder_open(struct inode *nodp, struct file *filp) proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; - get_task_struct(current); - proc->tsk = current; + get_task_struct(current->group_leader); + proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); From 0fd0992d350402a927408caf20726307993e0010 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 7 Mar 2017 15:54:56 +0100 Subject: [PATCH 0680/3220] android: binder: add padding to binder_fd_array_object. 
binder_fd_array_object starts with a 4-byte header, followed by a few fields that are 8 bytes when ANDROID_BINDER_IPC_32BIT=N. This can cause alignment issues in a 64-bit kernel with a 32-bit userspace, as on x86_32 an 8-byte primitive may be aligned to a 4-byte address. Pad with a __u32 to fix this. Change-Id: I4374ed2cc3ccd3c6a1474cb7209b53ebfd91077b Signed-off-by: Martijn Coenen --- include/uapi/linux/android/binder.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 51f891fb1b18..7668b5791c91 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -132,6 +132,7 @@ enum { /* struct binder_fd_array_object - object describing an array of fds in a buffer * @hdr: common header structure + * @pad: padding to ensure correct alignment * @num_fds: number of file descriptors in the buffer * @parent: index in offset array to buffer holding the fd array * @parent_offset: start offset of fd array in the buffer @@ -152,6 +153,7 @@ enum { */ struct binder_fd_array_object { struct binder_object_header hdr; + __u32 pad; binder_size_t num_fds; binder_size_t parent; binder_size_t parent_offset; From d6bbb3276728e4a13c832c55f948a40a27bc4a33 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 30 Sep 2016 16:40:04 +0200 Subject: [PATCH 0681/3220] android: binder: move global binder state into context struct. This change moves all global binder state into the context struct, thereby completely separating the state and the locks between two different contexts. The debugfs entries remain global, printing entries from all contexts. Change-Id: If8e3e2bece7bc6f974b66fbcf1d91d529ffa62f0 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 392 ++++++++++++++++++++++++++------------- 1 file changed, 259 insertions(+), 133 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 08cde76875d2..9cf4f9bbc711 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -18,6 +18,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -46,19 +47,11 @@ #include #include "binder_trace.h" -static DEFINE_MUTEX(binder_main_lock); -static DEFINE_MUTEX(binder_deferred_lock); -static DEFINE_MUTEX(binder_mmap_lock); - static HLIST_HEAD(binder_devices); -static HLIST_HEAD(binder_procs); -static HLIST_HEAD(binder_deferred_list); -static HLIST_HEAD(binder_dead_nodes); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -static int binder_last_id; -static struct workqueue_struct *binder_deferred_workqueue; +atomic_t binder_last_id; #define BINDER_DEBUG_ENTRY(name) \ static int binder_##name##_open(struct inode *inode, struct file *file) \ @@ -173,20 +166,24 @@ enum binder_stat_types { struct binder_stats { int br[_IOC_NR(BR_FAILED_REPLY) + 1]; int bc[_IOC_NR(BC_REPLY_SG) + 1]; - int obj_created[BINDER_STAT_COUNT]; - int obj_deleted[BINDER_STAT_COUNT]; }; -static struct binder_stats binder_stats; +/* These are still global, since it's not always easy to get the context */ +struct binder_obj_stats { + atomic_t obj_created[BINDER_STAT_COUNT]; + atomic_t obj_deleted[BINDER_STAT_COUNT]; +}; + +static struct binder_obj_stats binder_obj_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { - binder_stats.obj_deleted[type]++; + atomic_inc(&binder_obj_stats.obj_deleted[type]); } static inline void binder_stats_created(enum binder_stat_types type) { - 
binder_stats.obj_created[type]++; + atomic_inc(&binder_obj_stats.obj_created[type]); } struct binder_transaction_log_entry { @@ -207,8 +204,6 @@ struct binder_transaction_log { int full; struct binder_transaction_log_entry entry[32]; }; -static struct binder_transaction_log binder_transaction_log; -static struct binder_transaction_log binder_transaction_log_failed; static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) @@ -229,6 +224,21 @@ struct binder_context { struct binder_node *binder_context_mgr_node; kuid_t binder_context_mgr_uid; const char *name; + + struct mutex binder_main_lock; + struct mutex binder_deferred_lock; + struct mutex binder_mmap_lock; + + struct hlist_head binder_procs; + struct hlist_head binder_dead_nodes; + struct hlist_head binder_deferred_list; + + struct work_struct deferred_work; + struct workqueue_struct *binder_deferred_workqueue; + struct binder_transaction_log transaction_log; + struct binder_transaction_log transaction_log_failed; + + struct binder_stats binder_stats; }; struct binder_device { @@ -451,17 +461,18 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) return retval; } -static inline void binder_lock(const char *tag) +static inline void binder_lock(struct binder_context *context, const char *tag) { trace_binder_lock(tag); - mutex_lock(&binder_main_lock); + mutex_lock(&context->binder_main_lock); trace_binder_locked(tag); } -static inline void binder_unlock(const char *tag) +static inline void binder_unlock(struct binder_context *context, + const char *tag) { trace_binder_unlock(tag); - mutex_unlock(&binder_main_lock); + mutex_unlock(&context->binder_main_lock); } static void binder_set_nice(long nice) @@ -946,7 +957,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, binder_stats_created(BINDER_STAT_NODE); rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); - node->debug_id = ++binder_last_id; + node->debug_id = atomic_inc_return(&binder_last_id); node->proc = proc; node->ptr = ptr; node->cookie = cookie; @@ -1088,7 +1099,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, if (new_ref == NULL) return NULL; binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = ++binder_last_id; + new_ref->debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); @@ -1848,7 +1859,7 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; - e = binder_transaction_log_add(&binder_transaction_log); + e = binder_transaction_log_add(&context->transaction_log); e->call_type = reply ? 
2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; @@ -1970,7 +1981,7 @@ static void binder_transaction(struct binder_proc *proc, } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - t->debug_id = ++binder_last_id; + t->debug_id = atomic_inc_return(&binder_last_id); e->debug_id = t->debug_id; if (reply) @@ -2234,7 +2245,8 @@ err_no_context_mgr_node: { struct binder_transaction_log_entry *fe; - fe = binder_transaction_log_add(&binder_transaction_log_failed); + fe = binder_transaction_log_add( + &context->transaction_log_failed); *fe = *e; } @@ -2262,8 +2274,8 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(uint32_t); trace_binder_command(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { - binder_stats.bc[_IOC_NR(cmd)]++; + if (_IOC_NR(cmd) < ARRAY_SIZE(context->binder_stats.bc)) { + context->binder_stats.bc[_IOC_NR(cmd)]++; proc->stats.bc[_IOC_NR(cmd)]++; thread->stats.bc[_IOC_NR(cmd)]++; } @@ -2628,8 +2640,8 @@ static void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread, uint32_t cmd) { trace_binder_return(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { - binder_stats.br[_IOC_NR(cmd)]++; + if (_IOC_NR(cmd) < ARRAY_SIZE(proc->stats.br)) { + proc->context->binder_stats.br[_IOC_NR(cmd)]++; proc->stats.br[_IOC_NR(cmd)]++; thread->stats.br[_IOC_NR(cmd)]++; } @@ -2693,7 +2705,7 @@ retry: if (wait_for_proc_work) proc->ready_threads++; - binder_unlock(__func__); + binder_unlock(proc->context, __func__); trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, @@ -2720,7 +2732,7 @@ retry: ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); } - binder_lock(__func__); + binder_lock(proc->context, __func__); if (wait_for_proc_work) proc->ready_threads--; @@ -3107,14 +3119,14 @@ static unsigned int binder_poll(struct file *filp, struct binder_thread *thread = NULL; int wait_for_proc_work; - binder_lock(__func__); + binder_lock(proc->context, __func__); thread = binder_get_thread(proc); wait_for_proc_work = thread->transaction_stack == NULL && list_empty(&thread->todo) && thread->return_error == BR_OK; - binder_unlock(__func__); + binder_unlock(proc->context, __func__); if (wait_for_proc_work) { if (binder_has_proc_work(proc, thread)) @@ -3241,6 +3253,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; struct binder_proc *proc = filp->private_data; + struct binder_context *context = proc->context; struct binder_thread *thread; unsigned int size = _IOC_SIZE(cmd); void __user *ubuf = (void __user *)arg; @@ -3254,7 +3267,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) goto err_unlocked; - binder_lock(__func__); + binder_lock(context, __func__); thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; @@ -3306,7 +3319,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err: if (thread) thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN; - binder_unlock(__func__); + binder_unlock(context, __func__); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); @@ -3378,7 +3391,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; - mutex_lock(&binder_mmap_lock); + 
mutex_lock(&proc->context->binder_mmap_lock); if (proc->buffer) { ret = -EBUSY; failure_string = "already mapped"; @@ -3393,7 +3406,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } proc->buffer = area->addr; proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; - mutex_unlock(&binder_mmap_lock); + mutex_unlock(&proc->context->binder_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { @@ -3438,12 +3451,12 @@ err_alloc_small_buf_failed: kfree(proc->pages); proc->pages = NULL; err_alloc_pages_failed: - mutex_lock(&binder_mmap_lock); + mutex_lock(&proc->context->binder_mmap_lock); vfree(proc->buffer); proc->buffer = NULL; err_get_vm_area_failed: err_already_mapped: - mutex_unlock(&binder_mmap_lock); + mutex_unlock(&proc->context->binder_mmap_lock); err_bad_arg: pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); @@ -3470,15 +3483,15 @@ static int binder_open(struct inode *nodp, struct file *filp) miscdev); proc->context = &binder_dev->context; - binder_lock(__func__); + binder_lock(proc->context, __func__); binder_stats_created(BINDER_STAT_PROC); - hlist_add_head(&proc->proc_node, &binder_procs); + hlist_add_head(&proc->proc_node, &proc->context->binder_procs); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); filp->private_data = proc; - binder_unlock(__func__); + binder_unlock(proc->context, __func__); if (binder_debugfs_dir_entry_proc) { char strbuf[11]; @@ -3543,6 +3556,7 @@ static int binder_release(struct inode *nodp, struct file *filp) static int binder_node_release(struct binder_node *node, int refs) { struct binder_ref *ref; + struct binder_context *context = node->proc->context; int death = 0; list_del_init(&node->work.entry); @@ -3558,7 +3572,7 @@ static int binder_node_release(struct binder_node *node, int refs) node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; - hlist_add_head(&node->dead_node, &binder_dead_nodes); + hlist_add_head(&node->dead_node, &context->binder_dead_nodes); hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; @@ -3623,7 +3637,8 @@ static void binder_deferred_release(struct binder_proc *proc) node = rb_entry(n, struct binder_node, rb_node); nodes++; rb_erase(&node->rb_node, &proc->nodes); - incoming_refs = binder_node_release(node, incoming_refs); + incoming_refs = binder_node_release(node, + incoming_refs); } outgoing_refs = 0; @@ -3695,14 +3710,16 @@ static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; struct files_struct *files; + struct binder_context *context = + container_of(work, struct binder_context, deferred_work); int defer; do { - binder_lock(__func__); - mutex_lock(&binder_deferred_lock); - if (!hlist_empty(&binder_deferred_list)) { - proc = hlist_entry(binder_deferred_list.first, + binder_lock(context, __func__); + mutex_lock(&context->binder_deferred_lock); + if (!hlist_empty(&context->binder_deferred_list)) { + proc = hlist_entry(context->binder_deferred_list.first, struct binder_proc, deferred_work_node); hlist_del_init(&proc->deferred_work_node); defer = proc->deferred_work; @@ -3711,7 +3728,7 @@ static void binder_deferred_func(struct work_struct *work) proc = NULL; defer = 0; } - mutex_unlock(&binder_deferred_lock); + mutex_unlock(&context->binder_deferred_lock); files = NULL; if (defer & BINDER_DEFERRED_PUT_FILES) { @@ -3726,24 +3743,24 @@ static void binder_deferred_func(struct work_struct *work) if (defer & 
BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ - binder_unlock(__func__); + binder_unlock(context, __func__); if (files) put_files_struct(files); } while (proc); } -static DECLARE_WORK(binder_deferred_work, binder_deferred_func); static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) { - mutex_lock(&binder_deferred_lock); + mutex_lock(&proc->context->binder_deferred_lock); proc->deferred_work |= defer; if (hlist_unhashed(&proc->deferred_work_node)) { hlist_add_head(&proc->deferred_work_node, - &binder_deferred_list); - queue_work(binder_deferred_workqueue, &binder_deferred_work); + &proc->context->binder_deferred_list); + queue_work(proc->context->binder_deferred_workqueue, + &proc->context->deferred_work); } - mutex_unlock(&binder_deferred_lock); + mutex_unlock(&proc->context->binder_deferred_lock); } static void print_binder_transaction(struct seq_file *m, const char *prefix, @@ -3974,8 +3991,20 @@ static const char * const binder_objstat_strings[] = { "transaction_complete" }; +static void add_binder_stats(struct binder_stats *from, struct binder_stats *to) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(to->bc); i++) + to->bc[i] += from->bc[i]; + + for (i = 0; i < ARRAY_SIZE(to->br); i++) + to->br[i] += from->br[i]; +} + static void print_binder_stats(struct seq_file *m, const char *prefix, - struct binder_stats *stats) + struct binder_stats *stats, + struct binder_obj_stats *obj_stats) { int i; @@ -3995,16 +4024,21 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, binder_return_strings[i], stats->br[i]); } - BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != + if (!obj_stats) + return; + + BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) != ARRAY_SIZE(binder_objstat_strings)); - BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != - ARRAY_SIZE(stats->obj_deleted)); - for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { - if (stats->obj_created[i] || stats->obj_deleted[i]) + BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) != + ARRAY_SIZE(obj_stats->obj_deleted)); + for (i = 0; i < ARRAY_SIZE(obj_stats->obj_created); i++) { + int obj_created = atomic_read(&obj_stats->obj_created[i]); + int obj_deleted = atomic_read(&obj_stats->obj_deleted[i]); + + if (obj_created || obj_deleted) seq_printf(m, "%s%s: active %d total %d\n", prefix, - binder_objstat_strings[i], - stats->obj_created[i] - stats->obj_deleted[i], - stats->obj_created[i]); + binder_objstat_strings[i], + obj_created - obj_deleted, obj_created); } } @@ -4059,85 +4093,131 @@ static void print_binder_proc_stats(struct seq_file *m, } seq_printf(m, " pending transactions: %d\n", count); - print_binder_stats(m, " ", &proc->stats); + print_binder_stats(m, " ", &proc->stats, NULL); } static int binder_state_show(struct seq_file *m, void *unused) { + struct binder_device *device; + struct binder_context *context; struct binder_proc *proc; struct binder_node *node; int do_lock = !binder_debug_no_lock; - - if (do_lock) - binder_lock(__func__); + bool wrote_dead_nodes_header = false; seq_puts(m, "binder state:\n"); - if (!hlist_empty(&binder_dead_nodes)) - seq_puts(m, "dead nodes:\n"); - hlist_for_each_entry(node, &binder_dead_nodes, dead_node) - print_binder_node(m, node); + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); + if (!wrote_dead_nodes_header && + !hlist_empty(&context->binder_dead_nodes)) { + seq_puts(m, "dead nodes:\n"); + wrote_dead_nodes_header = true; + } + 
hlist_for_each_entry(node, &context->binder_dead_nodes, + dead_node) + print_binder_node(m, node); - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc(m, proc, 1); - if (do_lock) - binder_unlock(__func__); + if (do_lock) + binder_unlock(context, __func__); + } + + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); + + hlist_for_each_entry(proc, &context->binder_procs, proc_node) + print_binder_proc(m, proc, 1); + if (do_lock) + binder_unlock(context, __func__); + } return 0; } static int binder_stats_show(struct seq_file *m, void *unused) { + struct binder_device *device; + struct binder_context *context; struct binder_proc *proc; + struct binder_stats total_binder_stats; int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + memset(&total_binder_stats, 0, sizeof(struct binder_stats)); + + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); + + add_binder_stats(&context->binder_stats, &total_binder_stats); + + if (do_lock) + binder_unlock(context, __func__); + } seq_puts(m, "binder stats:\n"); + print_binder_stats(m, "", &total_binder_stats, &binder_obj_stats); - print_binder_stats(m, "", &binder_stats); + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc_stats(m, proc); - if (do_lock) - binder_unlock(__func__); + hlist_for_each_entry(proc, &context->binder_procs, proc_node) + print_binder_proc_stats(m, proc); + if (do_lock) + binder_unlock(context, __func__); + } return 0; } static int binder_transactions_show(struct seq_file *m, void *unused) { + struct binder_device *device; + struct binder_context *context; struct binder_proc *proc; int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); - seq_puts(m, "binder transactions:\n"); - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc(m, proc, 0); - if (do_lock) - binder_unlock(__func__); + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); + + hlist_for_each_entry(proc, &context->binder_procs, proc_node) + print_binder_proc(m, proc, 0); + if (do_lock) + binder_unlock(context, __func__); + } return 0; } static int binder_proc_show(struct seq_file *m, void *unused) { + struct binder_device *device; + struct binder_context *context; struct binder_proc *itr; int pid = (unsigned long)m->private; int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + if (do_lock) + binder_lock(context, __func__); - hlist_for_each_entry(itr, &binder_procs, proc_node) { - if (itr->pid == pid) { - seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, itr, 1); + hlist_for_each_entry(itr, &context->binder_procs, proc_node) { + if (itr->pid == pid) { + seq_puts(m, "binder proc state:\n"); + print_binder_proc(m, itr, 1); + } } + if (do_lock) + binder_unlock(context, __func__); } - if (do_lock) - binder_unlock(__func__); return 0; } @@ -4152,11 +4232,10 @@ static void print_binder_transaction_log_entry(struct seq_file *m, e->to_node, e->target_handle, e->data_size, e->offsets_size); } -static int binder_transaction_log_show(struct seq_file *m, void *unused) +static 
int print_binder_transaction_log(struct seq_file *m, + struct binder_transaction_log *log) { - struct binder_transaction_log *log = m->private; int i; - if (log->full) { for (i = log->next; i < ARRAY_SIZE(log->entry); i++) print_binder_transaction_log_entry(m, &log->entry[i]); @@ -4166,6 +4245,31 @@ static int binder_transaction_log_show(struct seq_file *m, void *unused) return 0; } +static int binder_transaction_log_show(struct seq_file *m, void *unused) +{ + struct binder_device *device; + struct binder_context *context; + + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + print_binder_transaction_log(m, &context->transaction_log); + } + return 0; +} + +static int binder_failed_transaction_log_show(struct seq_file *m, void *unused) +{ + struct binder_device *device; + struct binder_context *context; + + hlist_for_each_entry(device, &binder_devices, hlist) { + context = &device->context; + print_binder_transaction_log(m, + &context->transaction_log_failed); + } + return 0; +} + static const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, @@ -4181,11 +4285,20 @@ BINDER_DEBUG_ENTRY(state); BINDER_DEBUG_ENTRY(stats); BINDER_DEBUG_ENTRY(transactions); BINDER_DEBUG_ENTRY(transaction_log); +BINDER_DEBUG_ENTRY(failed_transaction_log); + +static void __init free_binder_device(struct binder_device *device) +{ + if (device->context.binder_deferred_workqueue) + destroy_workqueue(device->context.binder_deferred_workqueue); + kfree(device); +} static int __init init_binder_device(const char *name) { int ret; struct binder_device *binder_device; + struct binder_context *context; binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); if (!binder_device) @@ -4195,31 +4308,65 @@ static int __init init_binder_device(const char *name) binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; binder_device->miscdev.name = name; - binder_device->context.binder_context_mgr_uid = INVALID_UID; - binder_device->context.name = name; + context = &binder_device->context; + context->binder_context_mgr_uid = INVALID_UID; + context->name = name; + + mutex_init(&context->binder_main_lock); + mutex_init(&context->binder_deferred_lock); + mutex_init(&context->binder_mmap_lock); + + context->binder_deferred_workqueue = + create_singlethread_workqueue(name); + + if (!context->binder_deferred_workqueue) { + ret = -ENOMEM; + goto err_create_singlethread_workqueue_failed; + } + + INIT_HLIST_HEAD(&context->binder_procs); + INIT_HLIST_HEAD(&context->binder_dead_nodes); + INIT_HLIST_HEAD(&context->binder_deferred_list); + INIT_WORK(&context->deferred_work, binder_deferred_func); ret = misc_register(&binder_device->miscdev); if (ret < 0) { - kfree(binder_device); - return ret; + goto err_misc_register_failed; } hlist_add_head(&binder_device->hlist, &binder_devices); + return ret; + +err_create_singlethread_workqueue_failed: +err_misc_register_failed: + free_binder_device(binder_device); return ret; } static int __init binder_init(void) { - int ret; + int ret = 0; char *device_name, *device_names; struct binder_device *device; struct hlist_node *tmp; - binder_deferred_workqueue = create_singlethread_workqueue("binder"); - if (!binder_deferred_workqueue) + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. 
+ */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) return -ENOMEM; + strcpy(device_names, binder_devices_param); + + while ((device_name = strsep(&device_names, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; + } + binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", @@ -4244,30 +4391,13 @@ static int __init binder_init(void) debugfs_create_file("transaction_log", S_IRUGO, binder_debugfs_dir_entry_root, - &binder_transaction_log, + NULL, &binder_transaction_log_fops); debugfs_create_file("failed_transaction_log", S_IRUGO, binder_debugfs_dir_entry_root, - &binder_transaction_log_failed, - &binder_transaction_log_fops); - } - - /* - * Copy the module_parameter string, because we don't want to - * tokenize it in-place. - */ - device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); - if (!device_names) { - ret = -ENOMEM; - goto err_alloc_device_names_failed; - } - strcpy(device_names, binder_devices_param); - - while ((device_name = strsep(&device_names, ","))) { - ret = init_binder_device(device_name); - if (ret) - goto err_init_binder_device_failed; + NULL, + &binder_failed_transaction_log_fops); } return ret; @@ -4276,12 +4406,8 @@ err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); hlist_del(&device->hlist); - kfree(device); + free_binder_device(device); } -err_alloc_device_names_failed: - debugfs_remove_recursive(binder_debugfs_dir_entry_root); - - destroy_workqueue(binder_deferred_workqueue); return ret; } From f52e71a12e80643225e7b6faf9b4dc171229af39 Mon Sep 17 00:00:00 2001 From: Anson Jacob Date: Fri, 12 Aug 2016 20:38:10 -0400 Subject: [PATCH 0682/3220] usb: gadget: f_accessory: Fix for UsbAccessory clean unbind. Reapplying fix by Darren Whobrey (Change 69674). Fixes issues: 20545, 59667 and 61390. With the prior version of f_accessory.c, UsbAccessories would not unbind cleanly when the application was closed or I/O stopped while the USB cable was still connected. The accessory gadget driver would be left in an invalid state which was not reset on subsequent binding or opening; a reboot was necessary to clear it. On some phones this issue causes the phone to reboot upon unplugging the USB cable. The main problem was that acc_disconnect was being called on I/O error, which reset both disconnected and online. A minor fix was required to properly track the setting and unsetting of the disconnected and online flags. Also added URB queue wakeups on unbind to help unblock waiting threads. Tested on a Nexus 7 (grouper). Expected behaviour now observed: closing the accessory causes blocked I/O to be interrupted with an IOException, and the accessory can be restarted after closing and re-opening the file handle. This is a generic fix that applies to all devices.
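The corrected flag protocol is easier to see in isolation. Below is a minimal sketch of the lifecycle this patch establishes, assuming the two bitfields from the diff that follows; the helper names are illustrative, not functions from the driver.

struct acc_state {
	int online:1;		/* tracks the USB side: set_alt/unbind/disable */
	int disconnected:1;	/* tracks the file side: open/release */
};

/* USB side: host selected our alt setting, so we are fully connected. */
void on_set_alt(struct acc_state *s)
{
	s->online = 1;
	s->disconnected = 0;
}

/* USB side: function disabled (e.g. cable pulled). Both flags change,
 * because acc_set_disconnected() no longer clears online. */
void on_disable(struct acc_state *s)
{
	s->disconnected = 1;
	s->online = 0;
}

/* File side: closing the device node marks us disconnected but must not
 * touch online, which still reflects the state of the USB link. */
void on_release(struct acc_state *s)
{
	s->disconnected = 1;
}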
Change-Id: I4e08b326730dd3a2820c863124cee10f7cb5501e Signed-off-by: Darren Whobrey Signed-off-by: Anson Jacob --- drivers/usb/gadget/function/f_accessory.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index f2fa0c271d70..76b8ae08a551 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -77,9 +77,13 @@ struct acc_dev { struct usb_ep *ep_in; struct usb_ep *ep_out; - /* set to 1 when we connect */ + /* online indicates state of function_set_alt & function_unbind + * set to 1 when we connect + */ int online:1; - /* Set to 1 when we disconnect. + + /* disconnected indicates state of open & release + * Set to 1 when we disconnect. * Not cleared until our file is closed. */ int disconnected:1; @@ -263,7 +267,6 @@ static struct usb_request *req_get(struct acc_dev *dev, struct list_head *head) static void acc_set_disconnected(struct acc_dev *dev) { - dev->online = 0; dev->disconnected = 1; } @@ -764,7 +767,10 @@ static int acc_release(struct inode *ip, struct file *fp) printk(KERN_INFO "acc_release\n"); WARN_ON(!atomic_xchg(&_acc_dev->open_excl, 0)); - _acc_dev->disconnected = 0; + /* indicate that we are disconnected + * still could be online so don't touch online flag + */ + _acc_dev->disconnected = 1; return 0; } @@ -1012,6 +1018,10 @@ acc_function_unbind(struct usb_configuration *c, struct usb_function *f) struct usb_request *req; int i; + dev->online = 0; /* clear online flag */ + wake_up(&dev->read_wq); /* unblock reads on closure */ + wake_up(&dev->write_wq); /* likewise for writes */ + while ((req = req_get(dev, &dev->tx_idle))) acc_request_free(req, dev->ep_in); for (i = 0; i < RX_REQ_MAX; i++) @@ -1143,6 +1153,7 @@ static int acc_function_set_alt(struct usb_function *f, } dev->online = 1; + dev->disconnected = 0; /* if online then not disconnected */ /* readers may be blocked waiting for us to go online */ wake_up(&dev->read_wq); @@ -1155,7 +1166,8 @@ static void acc_function_disable(struct usb_function *f) struct usb_composite_dev *cdev = dev->cdev; DBG(cdev, "acc_function_disable\n"); - acc_set_disconnected(dev); + acc_set_disconnected(dev); /* this now only sets disconnected */ + dev->online = 0; /* so now need to clear online flag here too */ usb_ep_disable(dev->ep_in); usb_ep_disable(dev->ep_out); From 6f4a2453a14bae428dd10ba3bb9c15dccfc9eb8d Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Fri, 10 Mar 2017 16:08:30 -0800 Subject: [PATCH 0683/3220] ANDROID: Replace spaces by '_' for some android filesystem tracepoints. Android files frequently have spaces in them, as do cmdline strings. Replace these spaces with '_', so tools that parse these tracepoints don't get terribly confused. Change-Id: I1cbbedf5c803aa6a58d9b8b7836e9125683c49d1 Signed-off-by: Mohan Srinivasan (cherry picked from commit 5035d5f0933758dd515327d038e5bef7e40dbaa7) --- include/trace/events/android_fs_template.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h index 4e61ffe7a814..b23d17b56c63 100644 --- a/include/trace/events/android_fs_template.h +++ b/include/trace/events/android_fs_template.h @@ -18,11 +18,18 @@ DECLARE_EVENT_CLASS(android_fs_data_start_template, ), TP_fast_assign( { + /* + * Replace the spaces in filenames and cmdlines + * because this screws up the tooling that parses + * the traces.
+ */ __assign_str(pathbuf, pathname); + (void)strreplace(__get_str(pathbuf), ' ', '_'); __entry->offset = offset; __entry->bytes = bytes; __entry->i_size = i_size_read(inode); __assign_str(cmdline, command); + (void)strreplace(__get_str(cmdline), ' ', '_'); __entry->pid = pid; __entry->ino = inode->i_ino; } From 174a03a0bdc252ec122d27deca311f6caa0c2810 Mon Sep 17 00:00:00 2001 From: "Jon Medhurst (Tixy)" Date: Wed, 9 Dec 2015 09:40:53 +0000 Subject: [PATCH 0684/3220] arm64: dts: juno: Add idle-states to device tree This patch adds idle-states bindings data collected through a set of benchmarking experiments (latency and energy consumption) on Juno boards. Latencies data represents the worst case scenarios as required by the DT idle-states bindings. Change-Id: I7b2d81fa66f8ce8b229457cfefff06e9edd545c7 (cherry picked from commit 286896f43b0248960f69660159b507b23751b38a) Signed-off-by: Jon Medhurst Acked-by: Lorenzo Pieralisi Signed-off-by: Olof Johansson --- arch/arm64/boot/dts/arm/juno-r1.dts | 28 ++++++++++++++++++++++++++++ arch/arm64/boot/dts/arm/juno.dts | 28 ++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/arch/arm64/boot/dts/arm/juno-r1.dts b/arch/arm64/boot/dts/arm/juno-r1.dts index 93bc3d7d51c0..8826f834f54f 100644 --- a/arch/arm64/boot/dts/arm/juno-r1.dts +++ b/arch/arm64/boot/dts/arm/juno-r1.dts @@ -60,6 +60,28 @@ }; }; + idle-states { + entry-method = "arm,psci"; + + CPU_SLEEP_0: cpu-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = <0x0010000>; + local-timer-stop; + entry-latency-us = <300>; + exit-latency-us = <1200>; + min-residency-us = <2000>; + }; + + CLUSTER_SLEEP_0: cluster-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = <0x1010000>; + local-timer-stop; + entry-latency-us = <300>; + exit-latency-us = <1200>; + min-residency-us = <2500>; + }; + }; + A57_0: cpu@0 { compatible = "arm,cortex-a57","arm,armv8"; reg = <0x0 0x0>; @@ -67,6 +89,7 @@ enable-method = "psci"; next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A57_1: cpu@1 { @@ -76,6 +99,7 @@ enable-method = "psci"; next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_0: cpu@100 { @@ -85,6 +109,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_1: cpu@101 { @@ -94,6 +119,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_2: cpu@102 { @@ -103,6 +129,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_3: cpu@103 { @@ -112,6 +139,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A57_L2: l2-cache0 { diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts index 53442b5ee4ff..dcfcf15a17f5 100644 --- a/arch/arm64/boot/dts/arm/juno.dts +++ b/arch/arm64/boot/dts/arm/juno.dts @@ -60,6 +60,28 @@ }; }; + idle-states { + entry-method = "arm,psci"; + + CPU_SLEEP_0: cpu-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = <0x0010000>; + local-timer-stop; + entry-latency-us = <300>; + exit-latency-us = <1200>; + min-residency-us = <2000>; + }; + + CLUSTER_SLEEP_0: cluster-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = 
<0x1010000>; + local-timer-stop; + entry-latency-us = <300>; + exit-latency-us = <1200>; + min-residency-us = <2500>; + }; + }; + A57_0: cpu@0 { compatible = "arm,cortex-a57","arm,armv8"; reg = <0x0 0x0>; @@ -67,6 +89,7 @@ enable-method = "psci"; next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A57_1: cpu@1 { @@ -76,6 +99,7 @@ enable-method = "psci"; next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_0: cpu@100 { @@ -85,6 +109,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_1: cpu@101 { @@ -94,6 +119,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_2: cpu@102 { @@ -103,6 +129,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A53_3: cpu@103 { @@ -112,6 +139,7 @@ enable-method = "psci"; next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; }; A57_L2: l2-cache0 { From a2849d45025ca5894e99502752a86ebc744a4361 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 13 Nov 2015 10:21:39 +0000 Subject: [PATCH 0685/3220] DTB: Add EAS compatible Juno Energy model to 'juno.dts' EAS expects the energy model for the CPUs and cluster states to be available in the DTB. The energy model data comes from previous versions. Change-Id: I87535c8d802797361333929d809b43383bc8954b (cherry picked from commit bf137f205f312a1814ae38f908ec7bdbdddeaa3e (LSK 4.4)) Signed-off-by: Chris Redpath Signed-off-by: Punit Agrawal Signed-off-by: Jon Medhurst --- .../arm64/boot/dts/arm/juno-sched-energy.dtsi | 147 ++++++++++++++++++ arch/arm64/boot/dts/arm/juno.dts | 8 + 2 files changed, 155 insertions(+) create mode 100644 arch/arm64/boot/dts/arm/juno-sched-energy.dtsi diff --git a/arch/arm64/boot/dts/arm/juno-sched-energy.dtsi b/arch/arm64/boot/dts/arm/juno-sched-energy.dtsi new file mode 100644 index 000000000000..38207e4391ab --- /dev/null +++ b/arch/arm64/boot/dts/arm/juno-sched-energy.dtsi @@ -0,0 +1,147 @@ +/* + * ARM JUNO specific energy cost model data. There are no unit requirements for + * the data. Data can be normalized to any reference point, but the + * normalization must be consistent. That is, one bogo-joule/watt must be the + * same quantity for all data, but we don't care what it is. 
+ */ + +/* static struct idle_state idle_states_cluster_a53[] = { */ +/* { .power = 56 }, /\* arch_cpu_idle() (active idle) = WFI *\/ */ +/* { .power = 56 }, /\* WFI *\/ */ +/* { .power = 56 }, /\* cpu-sleep-0 *\/ */ +/* { .power = 17 }, /\* cluster-sleep-0 *\/ */ +/* }; */ + +/* static struct idle_state idle_states_cluster_a57[] = { */ +/* { .power = 65 }, /\* arch_cpu_idle() (active idle) = WFI *\/ */ +/* { .power = 65 }, /\* WFI *\/ */ +/* { .power = 65 }, /\* cpu-sleep-0 *\/ */ +/* { .power = 24 }, /\* cluster-sleep-0 *\/ */ +/* }; */ + +/* static struct capacity_state cap_states_cluster_a53[] = { */ +/* /\* Power per cluster *\/ */ +/* { .cap = 235, .power = 26, }, /\* 450 MHz *\/ */ +/* { .cap = 303, .power = 30, }, /\* 575 MHz *\/ */ +/* { .cap = 368, .power = 39, }, /\* 700 MHz *\/ */ +/* { .cap = 406, .power = 47, }, /\* 775 MHz *\/ */ +/* { .cap = 447, .power = 57, }, /\* 850 Mhz *\/ */ +/* }; */ + +/* static struct capacity_state cap_states_cluster_a57[] = { */ +/* /\* Power per cluster *\/ */ +/* { .cap = 417, .power = 24, }, /\* 450 MHz *\/ */ +/* { .cap = 579, .power = 32, }, /\* 625 MHz *\/ */ +/* { .cap = 744, .power = 43, }, /\* 800 MHz *\/ */ +/* { .cap = 883, .power = 49, }, /\* 950 MHz *\/ */ +/* { .cap = 1024, .power = 64, }, /\* 1100 MHz *\/ */ +/* }; */ + +/* static struct sched_group_energy energy_cluster_a53 = { */ +/* .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a53), */ +/* .idle_states = idle_states_cluster_a53, */ +/* .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a53), */ +/* .cap_states = cap_states_cluster_a53, */ +/* }; */ + +/* static struct sched_group_energy energy_cluster_a57 = { */ +/* .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a57), */ +/* .idle_states = idle_states_cluster_a57, */ +/* .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a57), */ +/* .cap_states = cap_states_cluster_a57, */ +/* }; */ + +/* static struct idle_state idle_states_core_a53[] = { */ +/* { .power = 6 }, /\* arch_cpu_idle() (active idle) = WFI *\/ */ +/* { .power = 6 }, /\* WFI *\/ */ +/* { .power = 0 }, /\* cpu-sleep-0 *\/ */ +/* { .power = 0 }, /\* cluster-sleep-0 *\/ */ +/* }; */ + +/* static struct idle_state idle_states_core_a57[] = { */ +/* { .power = 15 }, /\* arch_cpu_idle() (active idle) = WFI *\/ */ +/* { .power = 15 }, /\* WFI *\/ */ +/* { .power = 0 }, /\* cpu-sleep-0 *\/ */ +/* { .power = 0 }, /\* cluster-sleep-0 *\/ */ +/* }; */ + +/* static struct capacity_state cap_states_core_a53[] = { */ +/* /\* Power per cpu *\/ */ +/* { .cap = 235, .power = 33, }, /\* 450 MHz *\/ */ +/* { .cap = 302, .power = 46, }, /\* 575 MHz *\/ */ +/* { .cap = 368, .power = 61, }, /\* 700 MHz *\/ */ +/* { .cap = 406, .power = 76, }, /\* 775 MHz *\/ */ +/* { .cap = 447, .power = 93, }, /\* 850 Mhz *\/ */ +/* }; */ + +/* static struct capacity_state cap_states_core_a57[] = { */ +/* /\* Power per cpu *\/ */ +/* { .cap = 417, .power = 168, }, /\* 450 MHz *\/ */ +/* { .cap = 579, .power = 251, }, /\* 625 MHz *\/ */ +/* { .cap = 744, .power = 359, }, /\* 800 MHz *\/ */ +/* { .cap = 883, .power = 479, }, /\* 950 MHz *\/ */ +/* { .cap = 1024, .power = 616, }, /\* 1100 MHz *\/ */ +/* }; */ + +energy-costs { + CPU_COST_A57: core-cost0 { + busy-cost-data = < + 417 168 + 579 251 + 744 359 + 883 479 + 1023 616 + >; + idle-cost-data = < + 15 + 15 + 0 + 0 + >; + }; + CPU_COST_A53: core-cost1 { + busy-cost-data = < + 235 33 + 302 46 + 368 61 + 406 76 + 447 93 + >; + idle-cost-data = < + 6 + 6 + 0 + 0 + >; + }; + CLUSTER_COST_A57: cluster-cost0 { + busy-cost-data = < + 417 24 + 579 32 + 744 43 + 
883 49 + 1024 64 + >; + idle-cost-data = < + 65 + 65 + 65 + 24 + >; + }; + CLUSTER_COST_A53: cluster-cost1 { + busy-cost-data = < + 235 26 + 303 30 + 368 39 + 406 47 + 447 57 + >; + idle-cost-data = < + 56 + 56 + 56 + 17 + >; + }; +}; diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts index dcfcf15a17f5..f113a80519cd 100644 --- a/arch/arm64/boot/dts/arm/juno.dts +++ b/arch/arm64/boot/dts/arm/juno.dts @@ -90,6 +90,7 @@ next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A57 &CLUSTER_COST_A57>; }; A57_1: cpu@1 { @@ -100,6 +101,7 @@ next-level-cache = <&A57_L2>; clocks = <&scpi_dvfs 0>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A57 &CLUSTER_COST_A57>; }; A53_0: cpu@100 { @@ -110,6 +112,7 @@ next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>; }; A53_1: cpu@101 { @@ -120,6 +123,7 @@ next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>; }; A53_2: cpu@102 { @@ -130,6 +134,7 @@ next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>; }; A53_3: cpu@103 { @@ -140,6 +145,7 @@ next-level-cache = <&A53_L2>; clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>; }; A57_L2: l2-cache0 { @@ -149,6 +155,8 @@ A53_L2: l2-cache1 { compatible = "cache"; }; + + /include/ "juno-sched-energy.dtsi" }; pmu_a57 { From bce3e4dd9d49f2a3da44841a6b0b6dcb56ec149d Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 10 Jan 2017 16:10:35 -0800 Subject: [PATCH 0686/3220] ANDROID: uid_cputime: add per-uid IO usage accounting IO usage is accounted in foreground and background buckets. For each uid, IO usage is calculated in two steps:

delta = current total of all uid tasks - previous total
current bucket += delta

The bucket is determined by the uid's current state. Userspace writes to /proc/uid_procstat/set when the uid's state is updated. /proc/uid_io/stats shows IO usage in this format.
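The line format can be read directly off the seq_printf() added in the diff below: uid, then foreground rchar/wchar/read_bytes/write_bytes, then the same four background counters. Here is a sketch of a matching userspace parser, assuming the nine-field layout from this patch (a later patch in this series appends two fsync counters); the struct and function names are illustrative:

#include <stdio.h>

struct uid_io_line {
	unsigned int uid;
	unsigned long long fg_rchar, fg_wchar, fg_read_bytes, fg_write_bytes;
	unsigned long long bg_rchar, bg_wchar, bg_read_bytes, bg_write_bytes;
};

/* Parse one line of /proc/uid_io/stats; returns 0 on success. */
int parse_uid_io_line(const char *line, struct uid_io_line *s)
{
	int n = sscanf(line, "%u %llu %llu %llu %llu %llu %llu %llu %llu",
		       &s->uid,
		       &s->fg_rchar, &s->fg_wchar,
		       &s->fg_read_bytes, &s->fg_write_bytes,
		       &s->bg_rchar, &s->bg_wchar,
		       &s->bg_read_bytes, &s->bg_write_bytes);
	return n == 9 ? 0 : -1;
}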
Signed-off-by: Jin Qian Bug: 34198239 Change-Id: Ib8bebda53e7a56f45ea3eb0ec9a3153d44188102 --- drivers/misc/uid_cputime.c | 236 ++++++++++++++++++++++++++++++++++--- 1 file changed, 220 insertions(+), 16 deletions(-) diff --git a/drivers/misc/uid_cputime.c b/drivers/misc/uid_cputime.c index c1ad5246f564..f5135bb01a7a 100644 --- a/drivers/misc/uid_cputime.c +++ b/drivers/misc/uid_cputime.c @@ -30,7 +30,24 @@ DECLARE_HASHTABLE(hash_table, UID_HASH_BITS); static DEFINE_MUTEX(uid_lock); -static struct proc_dir_entry *parent; +static struct proc_dir_entry *cpu_parent; +static struct proc_dir_entry *io_parent; +static struct proc_dir_entry *proc_parent; + +struct io_stats { + u64 read_bytes; + u64 write_bytes; + u64 rchar; + u64 wchar; +}; + +#define UID_STATE_FOREGROUND 0 +#define UID_STATE_BACKGROUND 1 +#define UID_STATE_BUCKET_SIZE 2 + +#define UID_STATE_TOTAL_CURR 2 +#define UID_STATE_TOTAL_LAST 3 +#define UID_STATE_SIZE 4 struct uid_entry { uid_t uid; @@ -38,6 +55,8 @@ struct uid_entry { cputime_t stime; cputime_t active_utime; cputime_t active_stime; + int state; + struct io_stats io[UID_STATE_SIZE]; struct hlist_node hash; }; @@ -70,7 +89,7 @@ static struct uid_entry *find_or_register_uid(uid_t uid) return uid_entry; } -static int uid_stat_show(struct seq_file *m, void *v) +static int uid_cputime_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; struct task_struct *task, *temp; @@ -119,13 +138,13 @@ static int uid_stat_show(struct seq_file *m, void *v) return 0; } -static int uid_stat_open(struct inode *inode, struct file *file) +static int uid_cputime_open(struct inode *inode, struct file *file) { - return single_open(file, uid_stat_show, PDE_DATA(inode)); + return single_open(file, uid_cputime_show, PDE_DATA(inode)); } -static const struct file_operations uid_stat_fops = { - .open = uid_stat_open, +static const struct file_operations uid_cputime_fops = { + .open = uid_cputime_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, @@ -184,6 +203,162 @@ static const struct file_operations uid_remove_fops = { .write = uid_remove_write, }; +static void add_uid_io_curr_stats(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct io_stats *io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; + + io_curr->read_bytes += task->ioac.read_bytes; + io_curr->write_bytes += + task->ioac.write_bytes - task->ioac.cancelled_write_bytes; + io_curr->rchar += task->ioac.rchar; + io_curr->wchar += task->ioac.wchar; +} + +static void clean_uid_io_last_stats(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct io_stats *io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; + + io_last->read_bytes -= task->ioac.read_bytes; + io_last->write_bytes -= + task->ioac.write_bytes - task->ioac.cancelled_write_bytes; + io_last->rchar -= task->ioac.rchar; + io_last->wchar -= task->ioac.wchar; +} + +static void update_io_stats_locked(void) +{ + struct uid_entry *uid_entry; + struct task_struct *task, *temp; + struct io_stats *io_bucket, *io_curr, *io_last; + unsigned long bkt; + + BUG_ON(!mutex_is_locked(&uid_lock)); + + hash_for_each(hash_table, bkt, uid_entry, hash) + memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + + read_lock(&tasklist_lock); + do_each_thread(temp, task) { + uid_entry = find_or_register_uid(from_kuid_munged( + current_user_ns(), task_uid(task))); + if (!uid_entry) + continue; + add_uid_io_curr_stats(uid_entry, task); + } while_each_thread(temp, task); + read_unlock(&tasklist_lock); + + hash_for_each(hash_table, bkt, 
uid_entry, hash) { + io_bucket = &uid_entry->io[uid_entry->state]; + io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; + io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; + + io_bucket->read_bytes += + io_curr->read_bytes - io_last->read_bytes; + io_bucket->write_bytes += + io_curr->write_bytes - io_last->write_bytes; + io_bucket->rchar += io_curr->rchar - io_last->rchar; + io_bucket->wchar += io_curr->wchar - io_last->wchar; + + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + } +} + +static int uid_io_show(struct seq_file *m, void *v) +{ + struct uid_entry *uid_entry; + unsigned long bkt; + + mutex_lock(&uid_lock); + + update_io_stats_locked(); + + hash_for_each(hash_table, bkt, uid_entry, hash) { + seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + uid_entry->uid, + uid_entry->io[UID_STATE_FOREGROUND].rchar, + uid_entry->io[UID_STATE_FOREGROUND].wchar, + uid_entry->io[UID_STATE_FOREGROUND].read_bytes, + uid_entry->io[UID_STATE_FOREGROUND].write_bytes, + uid_entry->io[UID_STATE_BACKGROUND].rchar, + uid_entry->io[UID_STATE_BACKGROUND].wchar, + uid_entry->io[UID_STATE_BACKGROUND].read_bytes, + uid_entry->io[UID_STATE_BACKGROUND].write_bytes); + } + + mutex_unlock(&uid_lock); + + return 0; +} + +static int uid_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, uid_io_show, PDE_DATA(inode)); +} + +static const struct file_operations uid_io_fops = { + .open = uid_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int uid_procstat_open(struct inode *inode, struct file *file) +{ + return single_open(file, NULL, NULL); +} + +static ssize_t uid_procstat_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + struct uid_entry *uid_entry; + uid_t uid; + int argc, state; + char input[128]; + + if (count >= sizeof(input)) + return -EINVAL; + + if (copy_from_user(input, buffer, count)) + return -EFAULT; + + input[count] = '\0'; + + argc = sscanf(input, "%u %d", &uid, &state); + if (argc != 2) + return -EINVAL; + + if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND) + return -EINVAL; + + mutex_lock(&uid_lock); + + uid_entry = find_or_register_uid(uid); + if (!uid_entry || uid_entry->state == state) { + mutex_unlock(&uid_lock); + return -EINVAL; + } + + update_io_stats_locked(); + + uid_entry->state = state; + + mutex_unlock(&uid_lock); + + return count; +} + +static const struct file_operations uid_procstat_fops = { + .open = uid_procstat_open, + .release = single_release, + .write = uid_procstat_write, +}; + static int process_notifier(struct notifier_block *self, unsigned long cmd, void *v) { @@ -207,6 +382,9 @@ static int process_notifier(struct notifier_block *self, uid_entry->utime += utime; uid_entry->stime += stime; + update_io_stats_locked(); + clean_uid_io_last_stats(uid_entry, task); + exit: mutex_unlock(&uid_lock); return NOTIFY_OK; @@ -216,25 +394,51 @@ static struct notifier_block process_notifier_block = { .notifier_call = process_notifier, }; -static int __init proc_uid_cputime_init(void) +static int __init proc_uid_sys_stats_init(void) { hash_init(hash_table); - parent = proc_mkdir("uid_cputime", NULL); - if (!parent) { - pr_err("%s: failed to create proc entry\n", __func__); - return -ENOMEM; + cpu_parent = proc_mkdir("uid_cputime", NULL); + if (!cpu_parent) { + pr_err("%s: failed to create uid_cputime proc entry\n", + __func__); + goto err; } - 
proc_create_data("remove_uid_range", S_IWUGO, parent, &uid_remove_fops, - NULL); + proc_create_data("remove_uid_range", 0222, cpu_parent, + &uid_remove_fops, NULL); + proc_create_data("show_uid_stat", 0444, cpu_parent, + &uid_cputime_fops, NULL); - proc_create_data("show_uid_stat", S_IRUGO, parent, &uid_stat_fops, - NULL); + io_parent = proc_mkdir("uid_io", NULL); + if (!io_parent) { + pr_err("%s: failed to create uid_io proc entry\n", + __func__); + goto err; + } + + proc_create_data("stats", 0444, io_parent, + &uid_io_fops, NULL); + + proc_parent = proc_mkdir("uid_procstat", NULL); + if (!proc_parent) { + pr_err("%s: failed to create uid_procstat proc entry\n", + __func__); + goto err; + } + + proc_create_data("set", 0222, proc_parent, + &uid_procstat_fops, NULL); profile_event_register(PROFILE_TASK_EXIT, &process_notifier_block); return 0; + +err: + remove_proc_subtree("uid_cputime", NULL); + remove_proc_subtree("uid_io", NULL); + remove_proc_subtree("uid_procstat", NULL); + return -ENOMEM; } -early_initcall(proc_uid_cputime_init); +early_initcall(proc_uid_sys_stats_init); From 6a61b529b4a92817adfb1d1393c6b914419a4555 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 10 Jan 2017 16:11:07 -0800 Subject: [PATCH 0687/3220] ANDROID: uid_sys_stats: rename uid_cputime.c to uid_sys_stats.c This module tracks cputime and io stats. Signed-off-by: Jin Qian Bug: 34198239 Change-Id: I9ee7d9e915431e0bb714b36b5a2282e1fdcc7342 --- android/configs/android-base.cfg | 2 +- drivers/misc/Kconfig | 6 ++++-- drivers/misc/Makefile | 2 +- drivers/misc/{uid_cputime.c => uid_sys_stats.c} | 0 4 files changed, 6 insertions(+), 4 deletions(-) rename drivers/misc/{uid_cputime.c => uid_sys_stats.c} (100%) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index f10371a981b7..2098fe97198c 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -157,7 +157,7 @@ CONFIG_STAGING=y CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y -CONFIG_UID_CPUTIME=y +CONFIG_UID_SYS_STATS=y CONFIG_UNIX=y CONFIG_USB_GADGET=y CONFIG_USB_CONFIGFS=y diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 06eddc0cb24f..5847d3be0835 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -525,11 +525,13 @@ config VEXPRESS_SYSCFG bus. System Configuration interface is one of the possible means of generating transactions on this bus. 
-config UID_CPUTIME - bool "Per-UID cpu time statistics" +config UID_SYS_STATS + bool "Per-UID statistics" depends on PROFILING help Per UID based cpu time statistics exported to /proc/uid_cputime + Per UID based io statistics exported to /proc/uid_io + Per UID based procstat control in /proc/uid_procstat config MEMORY_STATE_TIME tristate "Memory freq/bandwidth time statistics" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index b76b4c9fe104..9a3b402921b2 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -56,5 +56,5 @@ obj-$(CONFIG_GENWQE) += genwqe/ obj-$(CONFIG_ECHO) += echo/ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o obj-$(CONFIG_CXL_BASE) += cxl/ -obj-$(CONFIG_UID_CPUTIME) += uid_cputime.o +obj-$(CONFIG_UID_SYS_STATS) += uid_sys_stats.o obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o diff --git a/drivers/misc/uid_cputime.c b/drivers/misc/uid_sys_stats.c similarity index 100% rename from drivers/misc/uid_cputime.c rename to drivers/misc/uid_sys_stats.c From 5a420edf10dc41f907ba5b8175a9befde3219096 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 17 Jan 2017 17:26:07 -0800 Subject: [PATCH 0688/3220] ANDROID: uid_sys_stats: allow writing same state Signed-off-by: Jin Qian Bug: 34360629 Change-Id: Ia748351e07910b1febe54f0484ca1be58c4eb9c7 --- drivers/misc/uid_sys_stats.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index f5135bb01a7a..5f82d17e7ad9 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -339,11 +339,16 @@ static ssize_t uid_procstat_write(struct file *file, mutex_lock(&uid_lock); uid_entry = find_or_register_uid(uid); - if (!uid_entry || uid_entry->state == state) { + if (!uid_entry) { mutex_unlock(&uid_lock); return -EINVAL; } + if (uid_entry->state == state) { + mutex_unlock(&uid_lock); + return count; + } + update_io_stats_locked(); uid_entry->state = state; From 5c866b0f8a2ff6d75fece4463fb9162ca43a4a3a Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 28 Feb 2017 15:09:42 -0800 Subject: [PATCH 0689/3220] ANDROID: uid_sys_stats: fix negative write bytes. A task can cancel writes made by other tasks. In rare cases, cancelled_write_bytes is larger than write_bytes if the task itself didn't make any write. This doesn't affect total size but may cause confusion when looking at IO usage on individual tasks. 
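A worked example makes the wraparound concrete: the io accounting fields are u64, so if a task wrote nothing itself but cancelled 4096 bytes dirtied by others, the naive subtraction wraps to an enormous value instead of going negative. A standalone sketch, with values invented for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long long write_bytes = 0, cancelled_write_bytes = 4096;

	/* Naive: u64 underflow wraps to 2^64 - 4096. */
	printf("naive:   %llu\n", write_bytes - cancelled_write_bytes);

	/* Clamped to zero, as compute_write_bytes() below does. */
	printf("clamped: %llu\n",
	       write_bytes <= cancelled_write_bytes ?
			0ULL : write_bytes - cancelled_write_bytes);
	return 0;
}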
Bug: 35851986 Change-Id: If6cb549aeef9e248e18d804293401bb2b91918ca Signed-off-by: Jin Qian --- drivers/misc/uid_sys_stats.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 5f82d17e7ad9..7b746acf416c 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -203,14 +203,21 @@ static const struct file_operations uid_remove_fops = { .write = uid_remove_write, }; +static u64 compute_write_bytes(struct task_struct *task) +{ + if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes) + return 0; + + return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; +} + static void add_uid_io_curr_stats(struct uid_entry *uid_entry, struct task_struct *task) { struct io_stats *io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; io_curr->read_bytes += task->ioac.read_bytes; - io_curr->write_bytes += - task->ioac.write_bytes - task->ioac.cancelled_write_bytes; + io_curr->write_bytes += compute_write_bytes(task); io_curr->rchar += task->ioac.rchar; io_curr->wchar += task->ioac.wchar; } @@ -221,8 +228,7 @@ static void clean_uid_io_last_stats(struct uid_entry *uid_entry, struct io_stats *io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; io_last->read_bytes -= task->ioac.read_bytes; - io_last->write_bytes -= - task->ioac.write_bytes - task->ioac.cancelled_write_bytes; + io_last->write_bytes -= compute_write_bytes(task); io_last->rchar -= task->ioac.rchar; io_last->wchar -= task->ioac.wchar; } From a4f5f251e951d2e6f454f82c58e7044f7de87a0d Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 2 Mar 2017 13:32:59 -0800 Subject: [PATCH 0690/3220] ANDROID: sched: add a counter to track fsync Change-Id: I6c138de5b2332eea70f57e098134d1d141247b3f Signed-off-by: Jin Qian --- fs/sync.c | 1 + include/linux/sched.h | 8 ++++++++ include/linux/task_io_accounting.h | 2 ++ include/linux/task_io_accounting_ops.h | 1 + 4 files changed, 12 insertions(+) diff --git a/fs/sync.c b/fs/sync.c index dd5d1711c7ac..452179e31c39 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -218,6 +218,7 @@ static int do_fsync(unsigned int fd, int datasync) if (f.file) { ret = vfs_fsync(f.file, datasync); fdput(f); + inc_syscfs(current); } return ret; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 8be9f0dbdd0c..5b250c9f7718 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3227,6 +3227,11 @@ static inline void inc_syscw(struct task_struct *tsk) { tsk->ioac.syscw++; } + +static inline void inc_syscfs(struct task_struct *tsk) +{ + tsk->ioac.syscfs++; +} #else static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { @@ -3243,6 +3248,9 @@ static inline void inc_syscr(struct task_struct *tsk) static inline void inc_syscw(struct task_struct *tsk) { } +static inline void inc_syscfs(struct task_struct *tsk) +{ +} #endif #ifndef TASK_SIZE_OF diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h index bdf855c2856f..2dd338fdf881 100644 --- a/include/linux/task_io_accounting.h +++ b/include/linux/task_io_accounting.h @@ -18,6 +18,8 @@ struct task_io_accounting { u64 syscr; /* # of write syscalls */ u64 syscw; + /* # of fsync syscalls */ + u64 syscfs; #endif /* CONFIG_TASK_XACCT */ #ifdef CONFIG_TASK_IO_ACCOUNTING diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index 4d090f9ee608..1b505c804af3 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -96,6 +96,7 @@ static inline void 
task_chr_io_accounting_add(struct task_io_accounting *dst, dst->wchar += src->wchar; dst->syscr += src->syscr; dst->syscw += src->syscw; + dst->syscfs += src->syscfs; } #else static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, From 3f7fac35ec629680d3cb3b83cc321dab736aa6cc Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 2 Mar 2017 13:39:43 -0800 Subject: [PATCH 0691/3220] ANDROID: uid_sys_stats: account for fsync syscalls Change-Id: Ie888d8a0f4ec7a27dea86dc4afba8e6fd4203488 Signed-off-by: Jin Qian --- drivers/misc/uid_sys_stats.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 7b746acf416c..4988e323cf02 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -39,6 +39,7 @@ struct io_stats { u64 write_bytes; u64 rchar; u64 wchar; + u64 fsync; }; #define UID_STATE_FOREGROUND 0 @@ -220,6 +221,7 @@ static void add_uid_io_curr_stats(struct uid_entry *uid_entry, io_curr->write_bytes += compute_write_bytes(task); io_curr->rchar += task->ioac.rchar; io_curr->wchar += task->ioac.wchar; + io_curr->fsync += task->ioac.syscfs; } static void clean_uid_io_last_stats(struct uid_entry *uid_entry, @@ -231,6 +233,7 @@ static void clean_uid_io_last_stats(struct uid_entry *uid_entry, io_last->write_bytes -= compute_write_bytes(task); io_last->rchar -= task->ioac.rchar; io_last->wchar -= task->ioac.wchar; + io_last->fsync -= task->ioac.syscfs; } static void update_io_stats_locked(void) @@ -267,11 +270,13 @@ static void update_io_stats_locked(void) io_curr->write_bytes - io_last->write_bytes; io_bucket->rchar += io_curr->rchar - io_last->rchar; io_bucket->wchar += io_curr->wchar - io_last->wchar; + io_bucket->fsync += io_curr->fsync - io_last->fsync; io_last->read_bytes = io_curr->read_bytes; io_last->write_bytes = io_curr->write_bytes; io_last->rchar = io_curr->rchar; io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; } } @@ -285,7 +290,7 @@ static int uid_io_show(struct seq_file *m, void *v) update_io_stats_locked(); hash_for_each(hash_table, bkt, uid_entry, hash) { - seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", uid_entry->uid, uid_entry->io[UID_STATE_FOREGROUND].rchar, uid_entry->io[UID_STATE_FOREGROUND].wchar, @@ -294,7 +299,9 @@ static int uid_io_show(struct seq_file *m, void *v) uid_entry->io[UID_STATE_BACKGROUND].rchar, uid_entry->io[UID_STATE_BACKGROUND].wchar, uid_entry->io[UID_STATE_BACKGROUND].read_bytes, - uid_entry->io[UID_STATE_BACKGROUND].write_bytes); + uid_entry->io[UID_STATE_BACKGROUND].write_bytes, + uid_entry->io[UID_STATE_FOREGROUND].fsync, + uid_entry->io[UID_STATE_BACKGROUND].fsync); } mutex_unlock(&uid_lock); From 3009d5325611867c6a185272aaf9b733946d8852 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 1 Mar 2017 17:04:41 -0800 Subject: [PATCH 0692/3220] ANDROID: sdcardfs: Fix case insensitive lookup The previous case insensitive lookup relied on the entry being present in the dcache. This instead uses iterate_dir to find the correct case. 
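The approach relies on the dir_context actor contract: iterate_dir() calls the actor once per directory entry, and a non-zero return value stops the walk. A stripped-down sketch of that contract follows; the patch's real actor, sdcardfs_name_match(), additionally copies the matched name out so the lookup can be retried with the on-disk spelling.

struct match_ctx {
	struct dir_context ctx;	/* embedded; recovered via container_of() */
	const struct qstr *want;
	bool found;
};

static int match_actor(struct dir_context *ctx, const char *name, int namelen,
		       loff_t offset, u64 ino, unsigned int d_type)
{
	struct match_ctx *m = container_of(ctx, struct match_ctx, ctx);
	struct qstr candidate = QSTR_INIT(name, namelen);

	if (qstr_case_eq(m->want, &candidate)) {
		m->found = true;
		return 1;	/* non-zero stops iterate_dir() */
	}
	return 0;		/* keep scanning */
}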
Signed-off-by: Daniel Rosenberg bug: 35633782 Change-Id: I556f7090773468c1943c89a5e2aa07f746ba49c5 --- fs/sdcardfs/lookup.c | 68 +++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 6b595e892316..bbfb0775c700 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -206,6 +206,28 @@ out: return err; } +struct sdcardfs_name_data { + struct dir_context ctx; + const struct qstr *to_find; + char *name; + bool found; +}; + +static int sdcardfs_name_match(struct dir_context *ctx, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct sdcardfs_name_data *buf = container_of(ctx, struct sdcardfs_name_data, ctx); + struct qstr candidate = QSTR_INIT(name, namelen); + + if (qstr_case_eq(buf->to_find, &candidate)) { + memcpy(buf->name, name, namelen); + buf->name[namelen] = 0; + buf->found = true; + return 1; + } + return 0; +} + /* * Main driver function for sdcardfs's lookup. * @@ -242,27 +264,39 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry, &lower_path); /* check for other cases */ if (err == -ENOENT) { - struct dentry *child; - struct dentry *match = NULL; - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); - spin_lock(&lower_dir_dentry->d_lock); - list_for_each_entry(child, &lower_dir_dentry->d_subdirs, d_child) { - if (child && d_inode(child)) { - if (qstr_case_eq(&child->d_name, name)) { - match = dget(child); - break; - } - } + struct file *file; + const struct cred *cred = current_cred(); + + struct sdcardfs_name_data buffer = { + .ctx.actor = sdcardfs_name_match, + .to_find = name, + .name = __getname(), + .found = false, + }; + + if (!buffer.name) { + err = -ENOMEM; + goto out; } - spin_unlock(&lower_dir_dentry->d_lock); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); - if (match) { + file = dentry_open(lower_parent_path, O_RDONLY, cred); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto put_name; + } + err = iterate_dir(file, &buffer.ctx); + fput(file); + if (err) + goto put_name; + + if (buffer.found) err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, - match->d_name.name, 0, + buffer.name, 0, &lower_path); - dput(match); - } + else + err = -ENOENT; +put_name: + __putname(buffer.name); } /* no error: handle positive dentries */ From 6693b9450021aebb1f00e6dd05e6369420acdec2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 2 Mar 2017 18:07:21 -0800 Subject: [PATCH 0693/3220] ANDROID: sdcardfs: rate limit warning print Signed-off-by: Daniel Rosenberg Bug: 35848445 Change-Id: Ida72ea0ece191b2ae4a8babae096b2451eb563f6 --- fs/sdcardfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 68e615045616..aa85fcc8ec1a 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -20,6 +20,7 @@ #include "sdcardfs.h" #include +#include /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info) @@ -599,7 +600,7 @@ static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie) static int sdcardfs_permission_wrn(struct inode *inode, int mask) { - WARN(1, "sdcardfs does not support permission. Use permission2.\n"); + WARN_RATELIMIT(1, "sdcardfs does not support permission. 
Use permission2.\n"); return -EINVAL; } @@ -684,7 +685,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia) { - WARN(1, "sdcardfs does not support setattr. User setattr2.\n"); + WARN_RATELIMIT(1, "sdcardfs does not support setattr. User setattr2.\n"); return -EINVAL; } From 650cf58edf0091e149546a75404517e1481c3070 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 2 Mar 2017 15:11:27 -0800 Subject: [PATCH 0694/3220] ANDROID: sdcardfs: Replace get/put with d_lock dput cannot be called with a spin_lock. Instead, we protect our accesses by holding the d_lock. Signed-off-by: Daniel Rosenberg Bug: 35643557 Change-Id: I22cf30856d75b5616cbb0c223724f5ab866b5114 --- fs/sdcardfs/derived_perm.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index ca239a942065..c0d5a9d72bde 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -261,40 +261,48 @@ static int needs_fixup(perm_t perm) { return 0; } -void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) { +static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit, int depth) +{ struct dentry *child; struct sdcardfs_inode_info *info; - if (!dget(dentry)) - return; + + /* + * All paths will terminate their recursion on hitting PERM_ANDROID_OBB, + * PERM_ANDROID_MEDIA, or PERM_ANDROID_DATA. This happens at a depth of + * at most 3. + */ + WARN(depth > 3, "%s: Max expected depth exceeded!\n", __func__); + spin_lock_nested(&dentry->d_lock, depth); if (!d_inode(dentry)) { - dput(dentry); + spin_unlock(&dentry->d_lock); return; } info = SDCARDFS_I(d_inode(dentry)); if (needs_fixup(info->perm)) { - spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - dget(child); + spin_lock_nested(&child->d_lock, depth + 1); if (!(limit->flags & BY_NAME) || !strncasecmp(child->d_name.name, limit->name, limit->length)) { if (d_inode(child)) { get_derived_permission(dentry, child); fixup_tmp_permissions(d_inode(child)); - dput(child); + spin_unlock(&child->d_lock); break; } } - dput(child); + spin_unlock(&child->d_lock); } - spin_unlock(&dentry->d_lock); } else if (descendant_may_need_fixup(info, limit)) { - spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { - fixup_perms_recursive(child, limit); + __fixup_perms_recursive(child, limit, depth + 1); } - spin_unlock(&dentry->d_lock); } - dput(dentry); + spin_unlock(&dentry->d_lock); +} + +void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) +{ + __fixup_perms_recursive(dentry, limit, 0); } void drop_recursive(struct dentry *parent) { From 371536f070bfb173575a5e69ca475eb7ab34c9f3 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Mar 2017 17:11:51 -0800 Subject: [PATCH 0695/3220] ANDROID: sdcardfs: Use spin_lock_nested Signed-off-by: Daniel Rosenberg Bug: 36007653 Change-Id: I805d5afec797669679853fb2bb993ee38e6276e4 --- fs/sdcardfs/dentry.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 971928ab6c21..e6f8e9edf8a9 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -76,10 +76,10 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) if (dentry < lower_dentry) { spin_lock(&dentry->d_lock); - spin_lock(&lower_dentry->d_lock); + 
spin_lock_nested(&lower_dentry->d_lock, DENTRY_D_LOCK_NESTED); } else { spin_lock(&lower_dentry->d_lock); - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); } if (dentry->d_name.len != lower_dentry->d_name.len) { From 2c917ce67144478ef0b5aa6c0c812e4abbfcd6c0 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Mar 2017 17:20:02 -0800 Subject: [PATCH 0696/3220] ANDROID: sdcardfs: Switch to internal case insensitive compare There were still a few places where we called into a case insensitive lookup that was not defined by sdcardfs. Moving them all to the same place will allow us to switch the implementation in the future. Additionally, the check in fixup_perms_recursive did not take into account the length of both strings, causing extraneous matches when the name we were looking for was a prefix of the child name. Signed-off-by: Daniel Rosenberg Change-Id: I45ce768cd782cb4ea1ae183772781387c590ecc2 --- fs/sdcardfs/dentry.c | 8 ++------ fs/sdcardfs/derived_perm.c | 2 +- fs/sdcardfs/packagelist.c | 6 ++---- fs/sdcardfs/sdcardfs.h | 8 ++++++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index e6f8e9edf8a9..64494a50e250 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -82,11 +82,7 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); } - if (dentry->d_name.len != lower_dentry->d_name.len) { - __d_drop(dentry); - err = 0; - } else if (strncasecmp(dentry->d_name.name, lower_dentry->d_name.name, - dentry->d_name.len) != 0) { + if (!qstr_case_eq(&dentry->d_name, &lower_dentry->d_name)) { __d_drop(dentry); err = 0; } @@ -166,7 +162,7 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, } */ if (name->len == len) { - if (strncasecmp(name->name, str, len) == 0) + if (str_n_case_eq(name->name, str, len)) return 0; } return 1; diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index c0d5a9d72bde..925692f0d20c 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -282,7 +282,7 @@ static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search * if (needs_fixup(info->perm)) { list_for_each_entry(child, &dentry->d_subdirs, d_child) { spin_lock_nested(&child->d_lock, depth + 1); - if (!(limit->flags & BY_NAME) || !strncasecmp(child->d_name.name, limit->name, limit->length)) { + if (!(limit->flags & BY_NAME) || qstr_case_eq(&child->d_name, &limit->name)) { if (d_inode(child)) { get_derived_permission(dentry, child); fixup_tmp_permissions(d_inode(child)); diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 56d643f4a9ee..68f8f4571615 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -251,8 +251,7 @@ static void fixup_all_perms_name(const struct qstr *key) struct sdcardfs_sb_info *sbinfo; struct limit_search limit = { .flags = BY_NAME, - .name = key->name, - .length = key->len, + .name = QSTR_INIT(key->name, key->len), }; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { if (sbinfo_has_sdcard_magic(sbinfo)) @@ -265,8 +264,7 @@ static void fixup_all_perms_name_userid(const struct qstr *key, userid_t userid) struct sdcardfs_sb_info *sbinfo; struct limit_search limit = { .flags = BY_NAME | BY_USERID, - .name = key->name, - .length = key->len, + .name = QSTR_INIT(key->name, key->len), .userid = userid, }; list_for_each_entry(sbinfo, &sdcardfs_super_list, list) { diff --git a/fs/sdcardfs/sdcardfs.h 
b/fs/sdcardfs/sdcardfs.h index 042f989f0bea..0778eb063b63 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -470,8 +470,7 @@ extern void packagelist_exit(void); #define BY_USERID (1 << 1) struct limit_search { unsigned int flags; - const char *name; - size_t length; + struct qstr name; userid_t userid; }; @@ -612,6 +611,11 @@ static inline bool str_case_eq(const char *s1, const char *s2) return !strcasecmp(s1, s2); } +static inline bool str_n_case_eq(const char *s1, const char *s2, size_t len) +{ + return !strncasecmp(s1, s2, len); +} + static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2) { return q1->len == q2->len && str_case_eq(q1->name, q2->name); From a5504f851af9c9897b7cde886650fb56c7e6443f Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Mar 2017 17:45:46 -0800 Subject: [PATCH 0697/3220] ANDROID: sdcardfs: Use d_invalidate instead of drop_recursive drop_recursive did not properly remove stale dentries. Instead, we use the vfs's d_invalidate, which does the proper cleanup. Additionally, remove the now-unused drop_recursive and fixup_top_recursive. Signed-off-by: Daniel Rosenberg Change-Id: Ibff61b0c34b725b024a050169047a415bc90f0d8 --- fs/sdcardfs/derived_perm.c | 38 -------------------------------------- fs/sdcardfs/inode.c | 2 +- fs/sdcardfs/sdcardfs.h | 2 -- 3 files changed, 1 insertion(+), 41 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 925692f0d20c..4b365b95b437 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -305,44 +305,6 @@ void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit) __fixup_perms_recursive(dentry, limit, 0); } -void drop_recursive(struct dentry *parent) { - struct dentry *dentry; - struct sdcardfs_inode_info *info; - if (!d_inode(parent)) - return; - info = SDCARDFS_I(d_inode(parent)); - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (d_inode(dentry)) { - if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { - drop_recursive(dentry); - d_drop(dentry); - } - } - } - spin_unlock(&parent->d_lock); -} - -void fixup_top_recursive(struct dentry *parent) { - struct dentry *dentry; - struct sdcardfs_inode_info *info; - - if (!d_inode(parent)) - return; - info = SDCARDFS_I(d_inode(parent)); - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (d_inode(dentry)) { - if (SDCARDFS_I(d_inode(parent))->top != SDCARDFS_I(d_inode(dentry))->top) { - get_derived_permission(parent, dentry); - fixup_tmp_permissions(d_inode(dentry)); - fixup_top_recursive(dentry); - } - } - } - spin_unlock(&parent->d_lock); -} - /* main function for updating derived permission */ inline void update_derived_permission_lock(struct dentry *dentry) { diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index aa85fcc8ec1a..8d0875ff5710 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -529,7 +529,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, get_derived_permission_new(new_dentry->d_parent, old_dentry, &new_dentry->d_name); fixup_tmp_permissions(d_inode(old_dentry)); fixup_lower_ownership(old_dentry, new_dentry->d_name.name); - drop_recursive(old_dentry); /* Can't fixup ownership recursively :( */ + d_invalidate(old_dentry); /* Can't fixup ownership recursively :( */ out: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_old_dir_dentry); diff --git 
a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 0778eb063b63..e28a7dd47ebb 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -478,8 +478,6 @@ extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t useri uid_t uid, bool under_android, struct inode *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name); -extern void drop_recursive(struct dentry *parent); -extern void fixup_top_recursive(struct dentry *parent); extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); extern void update_derived_permission_lock(struct dentry *dentry); From f84495e490a5ddf5f06632c7002f5e498dea6a9e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 18:12:16 -0800 Subject: [PATCH 0698/3220] ANDROID: sdcardfs: Get the blocksize from the lower fs This changes sdcardfs to be more in line with the getattr in wrapfs, which calls the lower fs's getattr to get the block size Signed-off-by: Daniel Rosenberg Bug: 34723223 Change-Id: I1c9e16604ba580a8cdefa17f02dcc489d7351aed --- fs/sdcardfs/inode.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 8d0875ff5710..f713fad909d8 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -856,9 +856,7 @@ static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode, struct k static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct dentry *lower_dentry; - struct inode *inode; - struct inode *lower_inode; + struct kstat lower_stat; struct path lower_path; struct dentry *parent; int err; @@ -873,16 +871,15 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, } dput(parent); - inode = d_inode(dentry); - sdcardfs_get_lower_path(dentry, &lower_path); - lower_dentry = lower_path.dentry; - lower_inode = sdcardfs_lower_inode(inode); - - sdcardfs_copy_and_fix_attrs(inode, lower_inode); - fsstack_copy_inode_size(inode, lower_inode); - - err = sdcardfs_fillattr(mnt, inode, stat); + err = vfs_getattr(&lower_path, &lower_stat); + if (err) + goto out; + sdcardfs_copy_and_fix_attrs(d_inode(dentry), + d_inode(lower_path.dentry)); + err = sdcardfs_fillattr(mnt, d_inode(dentry), stat); + stat->blocks = lower_stat.blocks; +out: sdcardfs_put_lower_path(dentry, &lower_path); return err; } From 1e2f5dbfa3ab99eb764f5d0b1abc64946c3e0e2b Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 20:59:18 -0800 Subject: [PATCH 0699/3220] ANDROID: sdcardfs: declare MODULE_ALIAS_FS From commit ee616b78aa87 ("Wrapfs: declare MODULE_ALIAS_FS") Signed-off-by: Daniel Rosenberg bug: 35766959 Change-Id: Ia4728ab49d065b1d2eb27825046f14b97c328cba --- fs/sdcardfs/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 7a8eae29e44d..4e2aded8d1d9 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -432,6 +432,7 @@ static struct file_system_type sdcardfs_fs_type = { .kill_sb = sdcardfs_kill_sb, .fs_flags = 0, }; +MODULE_ALIAS_FS(SDCARDFS_NAME); static int __init init_sdcardfs_fs(void) { From 8dbb44c8aa4a47430b42500c3e8a482121162024 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 10 Mar 2017 12:39:42 -0800 Subject: [PATCH 0700/3220] ANDROID: sdcardfs: Use case insensitive hash function Case insensitive comparisons don't help us much if we hash to 
different buckets... Signed-off-by: Daniel Rosenberg bug: 36004503 Change-Id: I91e00dbcd860a709cbd4f7fd7fc6d855779f3285 --- fs/sdcardfs/packagelist.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 68f8f4571615..e72fe83f7837 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -20,6 +20,7 @@ #include "sdcardfs.h" #include +#include #include #include #include @@ -44,10 +45,18 @@ static DEFINE_HASHTABLE(ext_to_groupid, 8); static struct kmem_cache *hashtable_entry_cachep; +static unsigned int full_name_case_hash(const unsigned char *name, unsigned int len) +{ + unsigned long hash = init_name_hash(); + while (len--) + hash = partial_name_hash(tolower(*name++), hash); + return end_name_hash(hash); +} + static void inline qstr_init(struct qstr *q, const char *name) { q->name = name; q->len = strlen(q->name); - q->hash = full_name_hash(q->name, q->len); + q->hash = full_name_case_hash(q->name, q->len); } static inline int qstr_copy(const struct qstr *src, struct qstr *dest) { From 1a736af098a35e3eb42b28f92384cc18e47200bf Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 10 Mar 2017 13:54:30 -0800 Subject: [PATCH 0701/3220] ANDROID: sdcardfs: move path_put outside of spinlock Signed-off-by: Daniel Rosenberg Bug: 35643557 Change-Id: Ib279ebd7dd4e5884d184d67696a93e34993bc1ef --- fs/sdcardfs/derived_perm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 4b365b95b437..dba58d8c9d60 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -357,6 +357,8 @@ int is_obbpath_invalid(struct dentry *dent) struct sdcardfs_dentry_info *di = SDCARDFS_D(dent); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dent->d_sb); char *path_buf, *obbpath_s; + int need_put = 0; + struct path lower_path; /* check the base obbpath has been changed. * this routine can check an uninitialized obb dentry as well. @@ -383,10 +385,13 @@ int is_obbpath_invalid(struct dentry *dent) } //unlock_dir(lower_parent); - path_put(&di->lower_path); + pathcpy(&lower_path, &di->lower_path); + need_put = 1; } } spin_unlock(&di->lock); + if (need_put) + path_put(&lower_path); return ret; } From f1e5d0086840d504b12acfb958f19be932bab7ad Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 10 Mar 2017 18:58:25 -0800 Subject: [PATCH 0702/3220] ANDROID: sdcardfs: Remove uninformative prints At best these prints do not provide useful information, and at worst, some allow userspace to abuse the kernel log. 
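To make the abuse vector concrete: before this patch, every EACCES path in open, lookup, and friends emitted a KERN_INFO line containing a caller-controlled name, so an unprivileged loop could flood the kernel log. A userspace sketch of the problem being closed (the path here is hypothetical):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Each denied open() used to log a "check the caller's gid" line. */
	for (;;) {
		int fd = open("/storage/emulated/0/Android/data/denied",
			      O_RDONLY);
		if (fd >= 0)
			close(fd);
	}
}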
Signed-off-by: Daniel Rosenberg Bug: 36138424 Change-Id: I812c57cc6a22b37262935ab77f48f3af4c36827e --- fs/sdcardfs/derived_perm.c | 1 - fs/sdcardfs/file.c | 3 --- fs/sdcardfs/inode.c | 24 +----------------------- fs/sdcardfs/lookup.c | 3 --- 4 files changed, 1 insertion(+), 30 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index dba58d8c9d60..763f10340487 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -437,7 +437,6 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) if(!err) { /* the obbpath base has been found */ - printk(KERN_INFO "sdcardfs: the sbi->obbpath is found\n"); pathcpy(lower_path, &obbpath); } else { /* if the sbi->obbpath is not available, we can optionally diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 23f8cd7f8877..0592facef704 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -217,9 +217,6 @@ static int sdcardfs_open(struct inode *inode, struct file *file) } if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_err; } diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index f713fad909d8..f19f11ea19fc 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -68,9 +68,6 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct fs_struct *copied_fs; if(!check_caller_access_to_name(dir, &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -170,9 +167,6 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) const struct cred *saved_cred = NULL; if(!check_caller_access_to_name(dir, &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -280,9 +274,6 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct qstr q_data = QSTR_LITERAL("data"); if(!check_caller_access_to_name(dir, &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -392,9 +383,6 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) const struct cred *saved_cred = NULL; if(!check_caller_access_to_name(dir, &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -481,9 +469,6 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, if(!check_caller_access_to_name(old_dir, &old_dentry->d_name) || !check_caller_access_to_name(new_dir, &new_dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " new_dentry: %s, task:%s\n", - __func__, new_dentry->d_name.name, current->comm); err = -EACCES; goto out_eacces; } @@ -746,12 +731,8 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct if (!err) { /* check the Android group ID */ parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { - 
printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); + if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) err = -EACCES; - } dput(parent); } @@ -863,9 +844,6 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, parent = dget_parent(dentry); if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); dput(parent); return -EACCES; } diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index bbfb0775c700..fffb94c923c4 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -395,9 +395,6 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { ret = ERR_PTR(-EACCES); - printk(KERN_INFO "%s: need to check the caller's gid in packages.list\n" - " dentry: %s, task:%s\n", - __func__, dentry->d_name.name, current->comm); goto out_err; } From 003515b560db54d22424d91a5b5a626b42634ebd Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 13 Mar 2017 13:53:54 -0700 Subject: [PATCH 0703/3220] ANDROID: sdcardfs: Use tabs instead of spaces in multiuser.h Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: Ic7801914a7dd377e270647f81070020e1f0bab9b --- fs/sdcardfs/multiuser.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index 52bc20080904..ca141ff40b49 100644 --- a/fs/sdcardfs/multiuser.h +++ b/fs/sdcardfs/multiuser.h @@ -29,21 +29,21 @@ typedef uid_t userid_t; typedef uid_t appid_t; static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) { - return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); + return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } static inline gid_t multiuser_get_cache_gid(userid_t user_id, appid_t app_id) { - if (app_id >= AID_APP_START && app_id <= AID_APP_END) { - return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_CACHE_GID_START); - } else { - return -1; - } + if (app_id >= AID_APP_START && app_id <= AID_APP_END) { + return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_CACHE_GID_START); + } else { + return -1; + } } static inline gid_t multiuser_get_ext_gid(userid_t user_id, appid_t app_id) { - if (app_id >= AID_APP_START && app_id <= AID_APP_END) { - return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_EXT_GID_START); - } else { - return -1; - } + if (app_id >= AID_APP_START && app_id <= AID_APP_END) { + return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_EXT_GID_START); + } else { + return -1; + } } From d0b44039aa8477151b4630aae61abb5764c981b8 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 13 Mar 2017 15:34:03 -0700 Subject: [PATCH 0704/3220] ANDROID: sdcardfs: Fix gid issue We were already calculating most of these values, and erroring out because the check was confused by this. Instead of recalculating, adjust it as needed. 
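For illustration only (not part of this patch): the old helpers took a
(userid, appid) pair, but the call sites in fixup_lower_ownership()
passed info->d_uid, which is already a full derived uid, where an appid
was expected, so the AID_APP_START..AID_APP_END range check failed for
any secondary user. A worked example, assuming the usual AOSP constants
(taken from android_filesystem_config.h; an assumption here, not part
of the patch):

#include <assert.h>

#define AID_APP_START        10000	/* assumed AOSP constants */
#define AID_CACHE_GID_START  20000
#define AID_USER_OFFSET     100000

static unsigned int multiuser_get_uid(unsigned int user_id, unsigned int app_id)
{
	return user_id * AID_USER_OFFSET + app_id % AID_USER_OFFSET;
}

/* New-style helper: the uid already encodes the user, so only the
 * constant offset into the cache gid range needs to be applied. */
static unsigned int multiuser_get_cache_gid(unsigned int uid)
{
	return uid - AID_APP_START + AID_CACHE_GID_START;
}

int main(void)
{
	/* user 1, app 10005 -> d_uid 110005 */
	unsigned int d_uid = multiuser_get_uid(1, 10005);

	/* matches the old formula multiuser_get_uid(1, (10005 - 10000) + 20000) */
	assert(multiuser_get_cache_gid(d_uid) == 120005);
	return 0;
}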
Signed-off-by: Daniel Rosenberg
Bug: 36160015
Change-Id: I9caf3e2fd32ca2e37ff8ed71b1d392f1761bc9a9
---
 fs/sdcardfs/derived_perm.c | 4 ++--
 fs/sdcardfs/multiuser.h | 21 ++++++++-------------
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c
index 763f10340487..28e9b8d42f9e 100644
--- a/fs/sdcardfs/derived_perm.c
+++ b/fs/sdcardfs/derived_perm.c
@@ -205,13 +205,13 @@ void fixup_lower_ownership(struct dentry* dentry, const char *name) {
 		break;
 	case PERM_ANDROID_PACKAGE:
 		if (info->d_uid != 0)
-			gid = multiuser_get_ext_gid(info->userid, info->d_uid);
+			gid = multiuser_get_ext_gid(info->d_uid);
 		else
 			gid = multiuser_get_uid(info->userid, uid);
 		break;
 	case PERM_ANDROID_PACKAGE_CACHE:
 		if (info->d_uid != 0)
-			gid = multiuser_get_cache_gid(info->userid, info->d_uid);
+			gid = multiuser_get_cache_gid(info->d_uid);
 		else
 			gid = multiuser_get_uid(info->userid, uid);
 		break;
diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h
index ca141ff40b49..2e89b5872314 100644
--- a/fs/sdcardfs/multiuser.h
+++ b/fs/sdcardfs/multiuser.h
@@ -28,22 +28,17 @@
 typedef uid_t userid_t;
 typedef uid_t appid_t;
 
-static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) {
+static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id)
+{
 	return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET);
 }
 
-static inline gid_t multiuser_get_cache_gid(userid_t user_id, appid_t app_id) {
-	if (app_id >= AID_APP_START && app_id <= AID_APP_END) {
-		return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_CACHE_GID_START);
-	} else {
-		return -1;
-	}
+static inline gid_t multiuser_get_cache_gid(uid_t uid)
+{
+	return uid - AID_APP_START + AID_CACHE_GID_START;
 }
 
-static inline gid_t multiuser_get_ext_gid(userid_t user_id, appid_t app_id) {
-	if (app_id >= AID_APP_START && app_id <= AID_APP_END) {
-		return multiuser_get_uid(user_id, (app_id - AID_APP_START) + AID_EXT_GID_START);
-	} else {
-		return -1;
-	}
+static inline gid_t multiuser_get_ext_gid(uid_t uid)
+{
+	return uid - AID_APP_START + AID_EXT_GID_START;
 }

From 4b20ed9406bc88023353f04e07788ed4c3846aa2 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Tue, 14 Mar 2017 15:39:05 -0700
Subject: [PATCH 0705/3220] ANDROID: vfs: use permission2 in notify_change2

This allows filesystems to use their mount private data to influence
the permissions they use when attempting to touch.

Signed-off-by: Daniel Rosenberg
Bug: 36228261
Change-Id: I1052319ba1c3ce5d5e586aa7f8a80c08851a5c7f
---
 fs/attr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/attr.c b/fs/attr.c
index 11be2265a2d5..c86b37c38fb7 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -211,7 +211,7 @@ int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr *
 		return -EPERM;
 
 	if (!inode_owner_or_capable(inode)) {
-		error = inode_permission(inode, MAY_WRITE);
+		error = inode_permission2(mnt, inode, MAY_WRITE);
 		if (error)
 			return error;
 	}

From 75a7736c2ad9b019ac131f124f53f3c194bf3c95 Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Mon, 13 Mar 2017 12:22:21 -0700
Subject: [PATCH 0706/3220] uid_sys_stats: change to use rt_mutex

We see this happen multiple times under heavy workload in systrace,
with AMS stuck on uid_lock.
Running process: Process 953 Running thread: android.ui State: Uninterruptible Sleep Start: 1,025.628 ms Duration: 27,955.949 ms On CPU: Running instead: system_server Args: {kernel callsite when blocked:: "uid_procstat_write+0xb8/0x144"} Changing to rt_mutex can mitigate the priority inversion Bug: 34991231 Bug: 34193533 Test: on marlin Change-Id: I28eb3971331cea60b1075740c792ab87d103262c Signed-off-by: Wei Wang --- drivers/misc/uid_sys_stats.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 4988e323cf02..204b23484266 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,7 @@ #define UID_HASH_BITS 10 DECLARE_HASHTABLE(hash_table, UID_HASH_BITS); -static DEFINE_MUTEX(uid_lock); +static DEFINE_RT_MUTEX(uid_lock); static struct proc_dir_entry *cpu_parent; static struct proc_dir_entry *io_parent; static struct proc_dir_entry *proc_parent; @@ -98,7 +99,7 @@ static int uid_cputime_show(struct seq_file *m, void *v) cputime_t stime; unsigned long bkt; - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); hash_for_each(hash_table, bkt, uid_entry, hash) { uid_entry->active_stime = 0; @@ -111,7 +112,7 @@ static int uid_cputime_show(struct seq_file *m, void *v) current_user_ns(), task_uid(task))); if (!uid_entry) { read_unlock(&tasklist_lock); - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); pr_err("%s: failed to find the uid_entry for uid %d\n", __func__, from_kuid_munged(current_user_ns(), task_uid(task))); @@ -135,7 +136,7 @@ static int uid_cputime_show(struct seq_file *m, void *v) cputime_to_jiffies(total_stime)) * USEC_PER_MSEC); } - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return 0; } @@ -182,7 +183,7 @@ static ssize_t uid_remove_write(struct file *file, kstrtol(end_uid, 10, &uid_end) != 0) { return -EINVAL; } - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); for (; uid_start <= uid_end; uid_start++) { hash_for_each_possible_safe(hash_table, uid_entry, tmp, @@ -194,7 +195,7 @@ static ssize_t uid_remove_write(struct file *file, } } - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return count; } @@ -243,7 +244,7 @@ static void update_io_stats_locked(void) struct io_stats *io_bucket, *io_curr, *io_last; unsigned long bkt; - BUG_ON(!mutex_is_locked(&uid_lock)); + BUG_ON(!rt_mutex_is_locked(&uid_lock)); hash_for_each(hash_table, bkt, uid_entry, hash) memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, @@ -285,7 +286,7 @@ static int uid_io_show(struct seq_file *m, void *v) struct uid_entry *uid_entry; unsigned long bkt; - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); update_io_stats_locked(); @@ -304,7 +305,7 @@ static int uid_io_show(struct seq_file *m, void *v) uid_entry->io[UID_STATE_BACKGROUND].fsync); } - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return 0; } @@ -349,16 +350,16 @@ static ssize_t uid_procstat_write(struct file *file, if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND) return -EINVAL; - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); uid_entry = find_or_register_uid(uid); if (!uid_entry) { - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return -EINVAL; } if (uid_entry->state == state) { - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return count; } @@ -366,7 +367,7 @@ static ssize_t uid_procstat_write(struct file *file, uid_entry->state = state; - 
mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return count; } @@ -388,7 +389,7 @@ static int process_notifier(struct notifier_block *self, if (!task) return NOTIFY_OK; - mutex_lock(&uid_lock); + rt_mutex_lock(&uid_lock); uid = from_kuid_munged(current_user_ns(), task_uid(task)); uid_entry = find_or_register_uid(uid); if (!uid_entry) { @@ -404,7 +405,7 @@ static int process_notifier(struct notifier_block *self, clean_uid_io_last_stats(uid_entry, task); exit: - mutex_unlock(&uid_lock); + rt_mutex_unlock(&uid_lock); return NOTIFY_OK; } From 870382b806a424c63a02b43a3195953b833395f3 Mon Sep 17 00:00:00 2001 From: Bowgo Tsai Date: Thu, 2 Mar 2017 18:54:15 +0800 Subject: [PATCH 0707/3220] ANDROID: dm: android-verity: allow disable dm-verity for Treble VTS To start Treble VTS test, a single AOSP system.img will be flashed onto the device. The size of AOSP system.img might be different than the system partition size on device, making locating verity metadata fail (at the last fixed size of the partition). This change allows disabling dm-verity on system partition when the device is unlocked (orange device state) with invalid metadata. BUG: 35603549 Test: boot device with a different-sized system.img, checks verity is not enabled via: "adb shell getprop | grep partition.system.verified" Change-Id: Ide78dca4eefde4ab019e4b202d3f590dcb1bb506 Signed-off-by: Bowgo Tsai --- drivers/md/dm-android-verity.c | 53 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index ec0a4d19ca3e..c3c9502baf18 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -115,6 +115,12 @@ static inline bool is_userdebug(void) return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug)); } +static inline bool is_unlocked(void) +{ + static const char unlocked[] = "orange"; + + return !strncmp(verifiedbootstate, unlocked, sizeof(unlocked)); +} static int table_extract_mpi_array(struct public_key_signature *pks, const void *data, size_t len) @@ -650,6 +656,28 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) return err; } +static int create_linear_device(struct dm_target *ti, dev_t dev, + char *target_device) +{ + u64 device_size = 0; + int err = find_size(dev, &device_size); + + if (err) { + DMERR("error finding bdev size"); + handle_error(); + return err; + } + + ti->len = device_size; + err = add_as_linear_device(ti, target_device); + if (err) { + handle_error(); + return err; + } + verity_enabled = false; + return 0; +} + /* * Target parameters: * Key id of the public key in the system keyring. 
@@ -673,7 +701,6 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	struct fec_ecc_metadata uninitialized_var(ecc);
 	char buf[FEC_ARG_LENGTH], *buf_ptr;
 	unsigned long long tmpll;
-	u64 uninitialized_var(device_size);
 
 	if (argc == 1) {
 		/* Use the default keyid */
@@ -701,23 +728,8 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		return -EINVAL;
 	}
 
-	if (is_eng()) {
-		err = find_size(dev, &device_size);
-		if (err) {
-			DMERR("error finding bdev size");
-			handle_error();
-			return err;
-		}
-
-		ti->len = device_size;
-		err = add_as_linear_device(ti, target_device);
-		if (err) {
-			handle_error();
-			return err;
-		}
-		verity_enabled = false;
-		return 0;
-	}
+	if (is_eng())
+		return create_linear_device(ti, dev, target_device);
 
 	strreplace(key_id, '#', ' ');
 
@@ -732,6 +744,11 @@ static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	err = extract_metadata(dev, &fec, &metadata, &verity_enabled);
 	if (err) {
+		/* Allow invalid metadata when the device is unlocked */
+		if (is_unlocked()) {
+			DMWARN("Allow invalid metadata when unlocked");
+			return create_linear_device(ti, dev, target_device);
+		}
 		DMERR("Error while extracting metadata");
 		handle_error();
 		goto free_metadata;

From 152a401d269358de1875789a5d4e85ea8fc3775b Mon Sep 17 00:00:00 2001
From: yangdongdong
Date: Sat, 8 Aug 2015 11:59:59 +0800
Subject: [PATCH 0708/3220] ANDROID: power: align wakeup_sources format

This aligns every column in wakeup_sources, so that any specific
column can be conveniently scanned for suspicious wakeup sources
(for example, ones with high power consumption), and the output is
generally easier for humans to read.

Change-Id: Iac8b0538170fcc0cca9f6857c15d9a4c62c8865e
Signed-off-by: yangdongdong
---
 drivers/base/power/wakeup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 09c07f519952..0e494108c20c 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -1042,7 +1042,7 @@ static int print_wakeup_source_stats(struct seq_file *m,
 		active_time = ktime_set(0, 0);
 	}
 
-	seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
+	seq_printf(m, "%-32s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
 		   ws->name, active_count, ws->event_count,
 		   ws->wakeup_count, ws->expire_count,
 		   ktime_to_ms(active_time), ktime_to_ms(total_time),
@@ -1062,7 +1062,7 @@ static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
 {
 	struct wakeup_source *ws;
 
-	seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+	seq_puts(m, "name\t\t\t\t\tactive_count\tevent_count\twakeup_count\t"
 		"expire_count\tactive_since\ttotal_time\tmax_time\t"
 		"last_change\tprevent_suspend_time\n");
 

From 2474d8bad0034d4c1143e10947c44a0d6b37dc92 Mon Sep 17 00:00:00 2001
From: Max Shi
Date: Fri, 26 Aug 2016 15:00:16 -0700
Subject: [PATCH 0709/3220] config: disable CONFIG_USELIB and CONFIG_FHANDLE

Turn off these two kernel configs to disable the related system call
ABIs.
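For illustration only (not part of this patch): both options are wired
up through cond_syscall(), so with CONFIG_USELIB=n and CONFIG_FHANDLE=n
the affected system calls still exist in the syscall table but fail
with ENOSYS. A userspace probe sketch, assuming an x86 target where
SYS_uselib is defined (arm64 never had uselib in its syscall table):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* With the configs disabled, both calls return -1 with
	 * errno == ENOSYS regardless of their arguments. */
	long ret = syscall(SYS_uselib, "/lib/libfoo.so");	/* path is arbitrary */
	printf("uselib: %ld (%s)\n", ret, strerror(errno));

	ret = syscall(SYS_open_by_handle_at, -1, NULL, 0);
	printf("open_by_handle_at: %ld (%s)\n", ret, strerror(errno));
	return 0;
}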
Bug: 30903194 Change-Id: I32e2ff3323135ce4b67a86f106fa9327a71fe309 Signed-off-by: Max Shi --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 2098fe97198c..d1f9628e4377 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -1,10 +1,12 @@ # KEEP ALPHABETICALLY SORTED # CONFIG_DEVKMEM is not set # CONFIG_DEVMEM is not set +# CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set # CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set +# CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y From a5e2a1ddbc50f49135a381137fde4eaee12e5071 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 7 Mar 2017 10:37:56 -0800 Subject: [PATCH 0710/3220] ANDROID: sched: fix duplicate sched_group_energy const specifiers EAS uses "const struct sched_group_energy * const" fairly consistently. But a couple of places swap the "*" and second "const", making the pointer mutable. In the case of struct sched_group, "* const" would have been an error, since init_sched_energy() writes to sd->groups->sge. Change-Id: Ic6a8fcf99e65c0f25d9cc55c32625ef3ca5c9aca Signed-off-by: Greg Hackmann --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3331f453a17f..83cfb72b2d95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4919,7 +4919,7 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg) } static int find_new_capacity(struct energy_env *eenv, - const struct sched_group_energy const *sge) + const struct sched_group_energy * const sge) { int idx; unsigned long util = group_max_util(eenv); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2f2b959ad244..780522c65cea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -915,7 +915,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; - const struct sched_group_energy const *sge; + const struct sched_group_energy *sge; /* * The CPUs this group covers. From b631d8c06f64f744f272bedbe8839f8d687db6da Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 20 Mar 2017 14:06:27 -0700 Subject: [PATCH 0711/3220] ANDROID: android-verity: do not compile as independent module dm-android-verity depends on optional kernel command line parameters. When compiled as module the __setup macro ends up being a no-op resulting in the following warnings: /work/build/batch/drivers/md/dm-android-verity.c:91:19: warning: 'verity_buildvariant' defined but not used [-Wunused-function] static int __init verity_buildvariant(char *line) ^~~~~~~~~~~~~~~~~~~ /work/build/batch/drivers/md/dm-android-verity.c:83:19: warning: 'verity_keyid_param' defined but not used [-Wunused-function] static int __init verity_keyid_param(char *line) ^~~~~~~~~~~~~~~~~~ /work/build/batch/drivers/md/dm-android-verity.c:75:19: warning: 'verity_mode_param' defined but not used [-Wunused-function] static int __init verity_mode_param(char *line) ^~~~~~~~~~~~~~~~~ /work/build/batch/drivers/md/dm-android-verity.c:67:19: warning: 'verified_boot_state_param' defined but not used [-Wunused-function] static int __init verified_boot_state_param(char *line) ^~~~~~~~~~~~~~~~~~~~~~~~~ Tested with allmodconfig. 
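For illustration only (not part of this patch): the warnings follow
from how include/linux/init.h defines __setup(). Paraphrased, not
verbatim:

/* built into the kernel: the handler is recorded in the .init.setup
 * section and invoked while the early command line is parsed */
#ifndef MODULE
#define __setup(str, fn)	__setup_param(str, fn, fn, 0)
#else
/* built as a module: there is no kernel command line to parse at
 * module load time, so the macro expands to nothing and the __init
 * handler becomes an unreferenced static function, triggering
 * -Wunused-function */
#define __setup(str, fn)
#endif

Making the target a bool and forcing its dependencies to =y guarantees
the code is always built in, so the __setup() handlers are always
registered.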
Change-Id: Idfe0c97b216bb620cc7264e968b494eb3a765157 Signed-off-by: Badhri Jagan Sridharan --- drivers/md/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3d237a03dab3..9eb08a43cd27 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -516,15 +516,15 @@ config DM_LOG_WRITES If unsure, say N. config DM_ANDROID_VERITY - tristate "Android verity target support" - depends on DM_VERITY + bool "Android verity target support" + depends on DM_VERITY=y depends on X509_CERTIFICATE_PARSER depends on SYSTEM_TRUSTED_KEYRING depends on PUBLIC_KEY_ALGO_RSA depends on KEYS depends on ASYMMETRIC_KEY_TYPE depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE - depends on MD_LINEAR + depends on MD_LINEAR=y select DM_VERITY_HASH_PREFETCH_MIN_SIZE_128 ---help--- This device-mapper target is virtually a VERITY target. This From bc5b6dd5dfd838f7338e8d1ae42f670adc1a4d03 Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Thu, 22 Dec 2016 12:37:34 +0900 Subject: [PATCH 0712/3220] BACKPORT: mmc: core: Export device lifetime information through sysfs In the eMMC 5.0 version of the spec, several EXT_CSD fields about device lifetime are added. - Two types of estimated indications reflected by averaged wear out of memory - An indication reflected by average reserved blocks Export the information through sysfs. Signed-off-by: Jungseung Lee Reviewed-by: Jaehoon Chung Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 12 ++++++++++++ include/linux/mmc/card.h | 3 +++ include/linux/mmc/mmc.h | 3 +++ 3 files changed, 18 insertions(+) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 79a0c26e1419..0e3d7152c8af 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -592,6 +592,12 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) card->ext_csd.ffu_capable = (ext_csd[EXT_CSD_SUPPORTED_MODE] & 0x1) && !(ext_csd[EXT_CSD_FW_CONFIG] & 0x1); + + card->ext_csd.pre_eol_info = ext_csd[EXT_CSD_PRE_EOL_INFO]; + card->ext_csd.device_life_time_est_typ_a = + ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A]; + card->ext_csd.device_life_time_est_typ_b = + ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B]; } out: return err; @@ -721,6 +727,10 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid); MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name); MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid); MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv); +MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info); +MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n", + card->ext_csd.device_life_time_est_typ_a, + card->ext_csd.device_life_time_est_typ_b); MMC_DEV_ATTR(serial, "0x%08x\n", card->cid.serial); MMC_DEV_ATTR(enhanced_area_offset, "%llu\n", card->ext_csd.enhanced_area_offset); @@ -757,6 +767,8 @@ static struct attribute *mmc_std_attrs[] = { &dev_attr_name.attr, &dev_attr_oemid.attr, &dev_attr_prv.attr, + &dev_attr_pre_eol_info.attr, + &dev_attr_life_time.attr, &dev_attr_serial.attr, &dev_attr_enhanced_area_offset.attr, &dev_attr_enhanced_area_size.attr, diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index eb0151bac50c..8f23fb2c5ed2 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -118,6 +118,9 @@ struct mmc_ext_csd { u8 raw_pwr_cl_ddr_200_360; /* 253 */ u8 raw_bkops_status; /* 246 */ u8 raw_sectors[4]; /* 212 - 4 bytes */ + u8 pre_eol_info; /* 267 */ + u8 device_life_time_est_typ_a; /* 268 */ + u8 device_life_time_est_typ_b; /* 269 */ unsigned int feature_support; #define 
MMC_DISCARD_FEATURE BIT(0) /* CMD38 feature */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 15f2c4a0a62c..2c6b1d45626e 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -330,6 +330,9 @@ struct _mmc_csd { #define EXT_CSD_CACHE_SIZE 249 /* RO, 4 bytes */ #define EXT_CSD_PWR_CL_DDR_200_360 253 /* RO */ #define EXT_CSD_FIRMWARE_VERSION 254 /* RO, 8 bytes */ +#define EXT_CSD_PRE_EOL_INFO 267 /* RO */ +#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A 268 /* RO */ +#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B 269 /* RO */ #define EXT_CSD_SUPPORTED_MODE 493 /* RO */ #define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */ #define EXT_CSD_DATA_TAG_SUPPORT 499 /* RO */ From 659da09c3d7af03ea57dbb6320e7899d89a7e403 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 21 Mar 2017 11:43:02 -0700 Subject: [PATCH 0713/3220] ANDROID: mmc: core: export emmc revision Change-Id: If23bc838327ef751ee65baadc429e218eaa2a848 Signed-off-by: Jin Qian --- drivers/mmc/core/mmc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 0e3d7152c8af..83b9bf880834 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -727,6 +727,7 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid); MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name); MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid); MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv); +MMC_DEV_ATTR(rev, "0x%x\n", card->ext_csd.rev); MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info); MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n", card->ext_csd.device_life_time_est_typ_a, @@ -767,6 +768,7 @@ static struct attribute *mmc_std_attrs[] = { &dev_attr_name.attr, &dev_attr_oemid.attr, &dev_attr_prv.attr, + &dev_attr_rev.attr, &dev_attr_pre_eol_info.attr, &dev_attr_life_time.attr, &dev_attr_serial.attr, From e953f89b8563efe0b92f70033d237874c43d4a3d Mon Sep 17 00:00:00 2001 From: Joel Scherpelz Date: Wed, 22 Mar 2017 18:19:04 +0900 Subject: [PATCH 0714/3220] net: ipv6: Add sysctl for minimum prefix len acceptable in RIOs. This commit adds a new sysctl accept_ra_rt_info_min_plen that defines the minimum acceptable prefix length of Route Information Options. The new sysctl is intended to be used together with accept_ra_rt_info_max_plen to configure a range of acceptable prefix lengths. It is useful to prevent misconfigurations from unintentionally blackholing too much of the IPv6 address space (e.g., home routers announcing RIOs for fc00::/7, which is incorrect). [backport of net-next bbea124bc99df968011e76eba105fe964a4eceab] Bug: 33333670 Test: net_test passes Signed-off-by: Joel Scherpelz Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++++++-- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 10 ++++++++++ include/uapi/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 2 ++ 6 files changed, 35 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 2042261408b9..5f1ea84ed72b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1413,11 +1413,20 @@ accept_ra_pinfo - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_rt_info_min_plen - INTEGER + Minimum prefix length of Route Information in RA. + + Route Information w/ prefix smaller than this variable shall + be ignored. + + Functional default: 0 if accept_ra_rtr_pref is enabled. 
+ -1 if accept_ra_rtr_pref is disabled. + accept_ra_rt_info_max_plen - INTEGER Maximum prefix length of Route Information in RA. - Route Information w/ prefix larger than or equal to this - variable shall be ignored. + Route Information w/ prefix larger than this variable shall + be ignored. Functional default: 0 if accept_ra_rtr_pref is enabled. -1 if accept_ra_rtr_pref is disabled. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ce777260e9ea..1182f0e21697 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -36,6 +36,7 @@ struct ipv6_devconf { __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO + __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 2b1533859749..c462f1dc175e 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -175,6 +175,16 @@ enum { DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, + DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, + DEVCONF_DROP_UNSOLICITED_NA, + DEVCONF_KEEP_ADDR_ON_DOWN, + DEVCONF_RTR_SOLICIT_MAX_INTERVAL, + DEVCONF_SEG6_ENABLED, + DEVCONF_SEG6_REQUIRE_HMAC, + DEVCONF_ENHANCED_DAD, + DEVCONF_ADDR_GEN_MODE, + DEVCONF_DISABLE_POLICY, + DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, DEVCONF_MAX }; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 0956373b56db..d18980e74534 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -570,6 +570,7 @@ enum { NET_IPV6_PROXY_NDP=23, NET_IPV6_ACCEPT_SOURCE_ROUTE=25, NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, + NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 498a664b8dc9..860ffbe778cf 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -202,6 +202,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -247,6 +248,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -4689,6 +4691,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_PROBE_INTERVAL] = jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO + array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen; array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif #endif @@ -5648,6 +5651,13 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_IPV6_ROUTE_INFO + { + .procname = "accept_ra_rt_info_min_plen", + .data = &ipv6_devconf.accept_ra_rt_info_min_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_rt_info_max_plen", .data = &ipv6_devconf.accept_ra_rt_info_max_plen, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 84afb9a77278..3452f9037ad4 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1358,6 +1358,8 @@ skip_linkparms: if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; + if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) + continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8 *)p, 
(p->nd_opt_len) << 3,

From f0faedd6b468777f3bb5834f97100794d562c8b7 Mon Sep 17 00:00:00 2001
From: Chenbo Feng
Date: Thu, 23 Mar 2017 13:51:24 -0700
Subject: [PATCH 0715/3220] fix the deadlock in xt_qtaguid when enabling DDEBUG

When DDEBUG is enabled, the prdebug_full_state() function tries to
recursively acquire the spinlock of sock_tag_list, causing a deadlock.
A check is added before acquiring the spinlock, to differentiate the
behavior depending on the caller of the function.

Bug: 36559739
Test: Compile and run test under system/extra/test/iptables/
Change-Id: Ie3397fbaa207e14fe214d47aaf5e8ca1f4a712ee
Signed-off-by: Chenbo Feng
---
 net/netfilter/xt_qtaguid.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c
index 3bf0c59dab2f..0f5628a59917 100644
--- a/net/netfilter/xt_qtaguid.c
+++ b/net/netfilter/xt_qtaguid.c
@@ -1814,8 +1814,11 @@ ret_res:
 }
 
 #ifdef DDEBUG
-/* This function is not in xt_qtaguid_print.c because of locks visibility */
-static void prdebug_full_state(int indent_level, const char *fmt, ...)
+/*
+ * This function is not in xt_qtaguid_print.c because of locks visibility.
+ * The lock of sock_tag_list must be acquired before calling this function
+ */
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...)
 {
 	va_list args;
 	char *fmt_buff;
@@ -1836,16 +1839,12 @@ static void prdebug_full_state(int indent_level, const char *fmt, ...)
 	kfree(buff);
 	va_end(args);
 
-	spin_lock_bh(&sock_tag_list_lock);
 	prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
-	spin_unlock_bh(&sock_tag_list_lock);
 
-	spin_lock_bh(&sock_tag_list_lock);
 	spin_lock_bh(&uid_tag_data_tree_lock);
 	prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
 	prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
 	spin_unlock_bh(&uid_tag_data_tree_lock);
-	spin_unlock_bh(&sock_tag_list_lock);
 
 	spin_lock_bh(&iface_stat_list_lock);
 	prdebug_iface_stat_list(indent_level, &iface_stat_list);
@@ -1854,7 +1853,7 @@ static void prdebug_full_state(int indent_level, const char *fmt, ...)
 	pr_debug("qtaguid: %s(): }\n", __func__);
 }
 #else
-static void prdebug_full_state(int indent_level, const char *fmt, ...) {}
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...) {}
 #endif
 
 struct proc_ctrl_print_info {
@@ -1977,8 +1976,11 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v)
 			   (u64)atomic64_read(&qtu_events.match_no_sk),
 			   (u64)atomic64_read(&qtu_events.match_no_sk_file));
 
-		/* Count the following as part of the last item_index */
-		prdebug_full_state(0, "proc ctrl");
+		/* Count the following as part of the last item_index.
No need + * to lock the sock_tag_list here since it is already locked when + * starting the seq_file operation + */ + prdebug_full_state_locked(0, "proc ctrl"); } return 0; @@ -2887,8 +2889,10 @@ static int qtudev_release(struct inode *inode, struct file *file) sock_tag_tree_erase(&st_to_free_tree); - prdebug_full_state(0, "%s(): pid=%u tgid=%u", __func__, + spin_lock_bh(&sock_tag_list_lock); + prdebug_full_state_locked(0, "%s(): pid=%u tgid=%u", __func__, current->pid, current->tgid); + spin_unlock_bh(&sock_tag_list_lock); return 0; } From 93a520cc3cc2106bc8cb3354d68d75f5f9d26817 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 21 Mar 2017 16:28:27 -0700 Subject: [PATCH 0716/3220] ANDROID: sdcardfs: correct order of descriptors Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: Ia6d16b19c8c911f41231d2a12be0740057edfacf --- fs/sdcardfs/packagelist.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index e72fe83f7837..5398ebf8a34b 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -48,12 +48,14 @@ static struct kmem_cache *hashtable_entry_cachep; static unsigned int full_name_case_hash(const unsigned char *name, unsigned int len) { unsigned long hash = init_name_hash(); + while (len--) hash = partial_name_hash(tolower(*name++), hash); return end_name_hash(hash); } -static void inline qstr_init(struct qstr *q, const char *name) { +static inline void qstr_init(struct qstr *q, const char *name) +{ q->name = name; q->len = strlen(q->name); q->hash = full_name_case_hash(q->name, q->len); From dfdaa9584471358a3d3865ca7de972875dd18bdd Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Mar 2017 17:42:58 -0700 Subject: [PATCH 0717/3220] ANDROID: sdcardfs: Fix formatting This fixes various spacing and bracket related issues pointed out by checkpatch. 
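For illustration only (not part of this patch): most of the churn below
is mechanical. The dominant change is the kernel coding-style rule that
case labels sit at the same indentation level as the switch keyword,
which scripts/checkpatch.pl reports as "switch and case should be at the
same indent". For example:

/* checkpatch-clean form: case aligns with switch */
switch (parent_info->perm) {
case PERM_INHERIT:
	/* Already inherited above */
	break;
default:
	break;
}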
Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I6e248833a7a04e3899f3ae9462d765cfcaa70c96 --- fs/sdcardfs/dentry.c | 13 +- fs/sdcardfs/derived_perm.c | 242 +++++++++++++++++++------------------ fs/sdcardfs/file.c | 5 +- fs/sdcardfs/inode.c | 50 ++++---- fs/sdcardfs/lookup.c | 32 ++--- fs/sdcardfs/main.c | 34 ++++-- fs/sdcardfs/packagelist.c | 43 ++++--- fs/sdcardfs/sdcardfs.h | 106 ++++++++-------- fs/sdcardfs/super.c | 29 +++-- 9 files changed, 300 insertions(+), 254 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 64494a50e250..aaa15dff6f51 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -46,7 +46,8 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) spin_unlock(&dentry->d_lock); /* check uninitialized obb_dentry and - * whether the base obbpath has been changed or not */ + * whether the base obbpath has been changed or not + */ if (is_obbpath_invalid(dentry)) { d_drop(dentry); return 0; @@ -106,12 +107,10 @@ out: static void sdcardfs_d_release(struct dentry *dentry) { /* release and reset the lower paths */ - if(has_graft_path(dentry)) { + if (has_graft_path(dentry)) sdcardfs_put_reset_orig_path(dentry); - } sdcardfs_put_reset_lower_path(dentry); free_dentry_private_data(dentry); - return; } static int sdcardfs_hash_ci(const struct dentry *dentry, @@ -168,14 +167,16 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, return 1; } -static void sdcardfs_canonical_path(const struct path *path, struct path *actual_path) { +static void sdcardfs_canonical_path(const struct path *path, + struct path *actual_path) +{ sdcardfs_get_real_lower(path->dentry, actual_path); } const struct dentry_operations sdcardfs_ci_dops = { .d_revalidate = sdcardfs_d_revalidate, .d_release = sdcardfs_d_release, - .d_hash = sdcardfs_hash_ci, + .d_hash = sdcardfs_hash_ci, .d_compare = sdcardfs_cmp_ci, .d_canonical_path = sdcardfs_canonical_path, }; diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 28e9b8d42f9e..51fa565742bd 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -37,7 +37,8 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) /* helper function for derived state */ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, - uid_t uid, bool under_android, struct inode *top) + uid_t uid, bool under_android, + struct inode *top) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); @@ -50,11 +51,14 @@ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, set_top(info, top); } -/* While renaming, there is a point where we want the path from dentry, but the name from newdentry */ -void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name) +/* While renaming, there is a point where we want the path from dentry, + * but the name from newdentry + */ +void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, + const struct qstr *name) { struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); + struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); appid_t appid; struct qstr q_Android = QSTR_LITERAL("Android"); struct qstr q_data = QSTR_LITERAL("data"); @@ -77,58 +81,57 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, co return; /* Derive custom permissions based on parent and current node */ switch (parent_info->perm) { - 
case PERM_INHERIT: - case PERM_ANDROID_PACKAGE_CACHE: - /* Already inherited above */ - break; - case PERM_PRE_ROOT: - /* Legacy internal layout places users at top level */ - info->perm = PERM_ROOT; - info->userid = simple_strtoul(name->name, NULL, 10); + case PERM_INHERIT: + case PERM_ANDROID_PACKAGE_CACHE: + /* Already inherited above */ + break; + case PERM_PRE_ROOT: + /* Legacy internal layout places users at top level */ + info->perm = PERM_ROOT; + info->userid = simple_strtoul(name->name, NULL, 10); + set_top(info, &info->vfs_inode); + break; + case PERM_ROOT: + /* Assume masked off by default. */ + if (qstr_case_eq(name, &q_Android)) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID; + info->under_android = true; set_top(info, &info->vfs_inode); - break; - case PERM_ROOT: - /* Assume masked off by default. */ - if (qstr_case_eq(name, &q_Android)) { - /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID; - info->under_android = true; - set_top(info, &info->vfs_inode); - } - break; - case PERM_ANDROID: - if (qstr_case_eq(name, &q_data)) { - /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_DATA; - set_top(info, &info->vfs_inode); - } else if (qstr_case_eq(name, &q_obb)) { - /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_OBB; - info->under_obb = true; - set_top(info, &info->vfs_inode); - /* Single OBB directory is always shared */ - } else if (qstr_case_eq(name, &q_media)) { - /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_MEDIA; - set_top(info, &info->vfs_inode); - } - break; - case PERM_ANDROID_OBB: - case PERM_ANDROID_DATA: - case PERM_ANDROID_MEDIA: - info->perm = PERM_ANDROID_PACKAGE; - appid = get_appid(name->name); - if (appid != 0 && !is_excluded(name->name, parent_info->userid)) { - info->d_uid = multiuser_get_uid(parent_info->userid, appid); - } + } + break; + case PERM_ANDROID: + if (qstr_case_eq(name, &q_data)) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_DATA; set_top(info, &info->vfs_inode); - break; - case PERM_ANDROID_PACKAGE: - if (qstr_case_eq(name, &q_cache)) { - info->perm = PERM_ANDROID_PACKAGE_CACHE; - info->under_cache = true; - } - break; + } else if (qstr_case_eq(name, &q_obb)) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_OBB; + info->under_obb = true; + set_top(info, &info->vfs_inode); + /* Single OBB directory is always shared */ + } else if (qstr_case_eq(name, &q_media)) { + /* App-specific directories inside; let anyone traverse */ + info->perm = PERM_ANDROID_MEDIA; + set_top(info, &info->vfs_inode); + } + break; + case PERM_ANDROID_OBB: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + info->perm = PERM_ANDROID_PACKAGE; + appid = get_appid(name->name); + if (appid != 0 && !is_excluded(name->name, parent_info->userid)) + info->d_uid = multiuser_get_uid(parent_info->userid, appid); + set_top(info, &info->vfs_inode); + break; + case PERM_ANDROID_PACKAGE: + if (qstr_case_eq(name, &q_cache)) { + info->perm = PERM_ANDROID_PACKAGE_CACHE; + info->under_cache = true; + } + break; } } @@ -137,7 +140,8 @@ void get_derived_permission(struct dentry *parent, struct dentry *dentry) get_derived_permission_new(parent, dentry, &dentry->d_name); } -static appid_t get_type(const char *name) { +static appid_t get_type(const char *name) +{ const char *ext = strrchr(name, '.'); appid_t id; @@ 
-149,7 +153,8 @@ static appid_t get_type(const char *name) { return AID_MEDIA_RW; } -void fixup_lower_ownership(struct dentry* dentry, const char *name) { +void fixup_lower_ownership(struct dentry *dentry, const char *name) +{ struct path path; struct inode *inode; struct inode *delegated_inode = NULL; @@ -175,49 +180,49 @@ void fixup_lower_ownership(struct dentry* dentry, const char *name) { } switch (perm) { - case PERM_ROOT: - case PERM_ANDROID: - case PERM_ANDROID_DATA: - case PERM_ANDROID_MEDIA: - case PERM_ANDROID_PACKAGE: - case PERM_ANDROID_PACKAGE_CACHE: - uid = multiuser_get_uid(info->userid, uid); - break; - case PERM_ANDROID_OBB: - uid = AID_MEDIA_OBB; - break; - case PERM_PRE_ROOT: - default: - break; + case PERM_ROOT: + case PERM_ANDROID: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + case PERM_ANDROID_PACKAGE: + case PERM_ANDROID_PACKAGE_CACHE: + uid = multiuser_get_uid(info->userid, uid); + break; + case PERM_ANDROID_OBB: + uid = AID_MEDIA_OBB; + break; + case PERM_PRE_ROOT: + default: + break; } switch (perm) { - case PERM_ROOT: - case PERM_ANDROID: - case PERM_ANDROID_DATA: - case PERM_ANDROID_MEDIA: - if (S_ISDIR(d_inode(dentry)->i_mode)) - gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); - else - gid = multiuser_get_uid(info->userid, get_type(name)); - break; - case PERM_ANDROID_OBB: - gid = AID_MEDIA_OBB; - break; - case PERM_ANDROID_PACKAGE: - if (info->d_uid != 0) - gid = multiuser_get_ext_gid(info->d_uid); - else - gid = multiuser_get_uid(info->userid, uid); - break; - case PERM_ANDROID_PACKAGE_CACHE: - if (info->d_uid != 0) - gid = multiuser_get_cache_gid(info->d_uid); - else - gid = multiuser_get_uid(info->userid, uid); - break; - case PERM_PRE_ROOT: - default: - break; + case PERM_ROOT: + case PERM_ANDROID: + case PERM_ANDROID_DATA: + case PERM_ANDROID_MEDIA: + if (S_ISDIR(d_inode(dentry)->i_mode)) + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + else + gid = multiuser_get_uid(info->userid, get_type(name)); + break; + case PERM_ANDROID_OBB: + gid = AID_MEDIA_OBB; + break; + case PERM_ANDROID_PACKAGE: + if (info->d_uid != 0) + gid = multiuser_get_ext_gid(info->d_uid); + else + gid = multiuser_get_uid(info->userid, uid); + break; + case PERM_ANDROID_PACKAGE_CACHE: + if (info->d_uid != 0) + gid = multiuser_get_cache_gid(info->d_uid); + else + gid = multiuser_get_uid(info->userid, uid); + break; + case PERM_PRE_ROOT: + default: + break; } sdcardfs_get_lower_path(dentry, &path); @@ -246,7 +251,8 @@ retry_deleg: sdcardfs_put_lower_path(dentry, &path); } -static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) { +static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) +{ if (info->perm == PERM_ROOT) return (limit->flags & BY_USERID)?info->userid == limit->userid:1; if (info->perm == PERM_PRE_ROOT || info->perm == PERM_ANDROID) @@ -254,7 +260,8 @@ static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct li return 0; } -static int needs_fixup(perm_t perm) { +static int needs_fixup(perm_t perm) +{ if (perm == PERM_ANDROID_DATA || perm == PERM_ANDROID_OBB || perm == PERM_ANDROID_MEDIA) return 1; @@ -292,9 +299,9 @@ static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search * } spin_unlock(&child->d_lock); } - } else if (descendant_may_need_fixup(info, limit)) { + } else if (descendant_may_need_fixup(info, limit)) { list_for_each_entry(child, &dentry->d_subdirs, d_child) { - __fixup_perms_recursive(child, limit, depth + 1); + 
__fixup_perms_recursive(child, limit, depth + 1); } } spin_unlock(&dentry->d_lock); @@ -310,7 +317,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) { struct dentry *parent; - if(!dentry || !d_inode(dentry)) { + if (!dentry || !d_inode(dentry)) { printk(KERN_ERR "sdcardfs: %s: invalid dentry\n", __func__); return; } @@ -318,11 +325,11 @@ inline void update_derived_permission_lock(struct dentry *dentry) * 1. need to check whether the dentry is updated or not * 2. remove the root dentry update */ - if(IS_ROOT(dentry)) { + if (IS_ROOT(dentry)) { //setup_default_pre_root_state(d_inode(dentry)); } else { parent = dget_parent(dentry); - if(parent) { + if (parent) { get_derived_permission(parent, dentry); dput(parent); } @@ -334,15 +341,15 @@ int need_graft_path(struct dentry *dentry) { int ret = 0; struct dentry *parent = dget_parent(dentry); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); + struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct qstr obb = QSTR_LITERAL("obb"); - if(parent_info->perm == PERM_ANDROID && + if (parent_info->perm == PERM_ANDROID && qstr_case_eq(&dentry->d_name, &obb)) { /* /Android/obb is the base obbpath of DERIVED_UNIFIED */ - if(!(sbi->options.multiuser == false + if (!(sbi->options.multiuser == false && parent_info->userid == 0)) { ret = 1; } @@ -362,17 +369,18 @@ int is_obbpath_invalid(struct dentry *dent) /* check the base obbpath has been changed. * this routine can check an uninitialized obb dentry as well. - * regarding the uninitialized obb, refer to the sdcardfs_mkdir() */ + * regarding the uninitialized obb, refer to the sdcardfs_mkdir() + */ spin_lock(&di->lock); - if(di->orig_path.dentry) { - if(!di->lower_path.dentry) { + if (di->orig_path.dentry) { + if (!di->lower_path.dentry) { ret = 1; } else { path_get(&di->lower_path); //lower_parent = lock_parent(lower_path->dentry); path_buf = kmalloc(PATH_MAX, GFP_ATOMIC); - if(!path_buf) { + if (!path_buf) { ret = 1; printk(KERN_ERR "sdcardfs: fail to allocate path_buf in %s.\n", __func__); } else { @@ -399,13 +407,13 @@ int is_base_obbpath(struct dentry *dentry) { int ret = 0; struct dentry *parent = dget_parent(dentry); - struct sdcardfs_inode_info *parent_info= SDCARDFS_I(d_inode(parent)); + struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct qstr q_obb = QSTR_LITERAL("obb"); spin_lock(&SDCARDFS_D(dentry)->lock); if (sbi->options.multiuser) { - if(parent_info->perm == PERM_PRE_ROOT && + if (parent_info->perm == PERM_PRE_ROOT && qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } @@ -420,7 +428,8 @@ int is_base_obbpath(struct dentry *dentry) /* The lower_path will be stored to the dentry's orig_path * and the base obbpath will be copyed to the lower_path variable. * if an error returned, there's no change in the lower_path - * returns: -ERRNO if error (0: no error) */ + * returns: -ERRNO if error (0: no error) + */ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) { int err = 0; @@ -435,7 +444,7 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) err = kern_path(sbi->obbpath_s, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &obbpath); - if(!err) { + if (!err) { /* the obbpath base has been found */ pathcpy(lower_path, &obbpath); } else { @@ -443,7 +452,8 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) * setup the lower_path with its orig_path. 
* but, the current implementation just returns an error * because the sdcard daemon also regards this case as - * a lookup fail. */ + * a lookup fail. + */ printk(KERN_INFO "sdcardfs: the sbi->obbpath is not available\n"); } return err; diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 0592facef704..4c9726606c0f 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -216,7 +216,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) goto out_err; } - if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { + if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { err = -EACCES; goto out_err; } @@ -248,9 +248,8 @@ static int sdcardfs_open(struct inode *inode, struct file *file) if (err) kfree(SDCARDFS_F(file)); - else { + else sdcardfs_copy_and_fix_attrs(inode, sdcardfs_lower_inode(inode)); - } out_revert_cred: REVERT_CRED(saved_cred); diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index f19f11ea19fc..ce187d865f29 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -23,10 +23,10 @@ #include /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ -const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info) +const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_info *info) { - struct cred * cred; - const struct cred * old_cred; + struct cred *cred; + const struct cred *old_cred; uid_t uid; cred = prepare_creds(); @@ -46,9 +46,9 @@ const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs } /* Do not directly use this function, use REVERT_CRED() instead. */ -void revert_fsids(const struct cred * old_cred) +void revert_fsids(const struct cred *old_cred) { - const struct cred * cur_cred; + const struct cred *cur_cred; cur_cred = current->cred; revert_creds(old_cred); @@ -67,7 +67,7 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, struct fs_struct *saved_fs; struct fs_struct *copied_fs; - if(!check_caller_access_to_name(dir, &dentry->d_name)) { + if (!check_caller_access_to_name(dir, &dentry->d_name)) { err = -EACCES; goto out_eacces; } @@ -166,7 +166,7 @@ static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry) struct path lower_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(dir, &dentry->d_name)) { + if (!check_caller_access_to_name(dir, &dentry->d_name)) { err = -EACCES; goto out_eacces; } @@ -240,13 +240,14 @@ out: } #endif -static int touch(char *abs_path, mode_t mode) { +static int touch(char *abs_path, mode_t mode) +{ struct file *filp = filp_open(abs_path, O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW, mode); + if (IS_ERR(filp)) { if (PTR_ERR(filp) == -EEXIST) { return 0; - } - else { + } else { printk(KERN_ERR "sdcardfs: failed to open(%s): %ld\n", abs_path, PTR_ERR(filp)); return PTR_ERR(filp); @@ -273,7 +274,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct qstr q_obb = QSTR_LITERAL("obb"); struct qstr q_data = QSTR_LITERAL("data"); - if(!check_caller_access_to_name(dir, &dentry->d_name)) { + if (!check_caller_access_to_name(dir, &dentry->d_name)) { err = -EACCES; goto out_eacces; } @@ -315,19 +316,21 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode } /* if it is a local obb dentry, setup it with the base obbpath */ - if(need_graft_path(dentry)) { + if (need_graft_path(dentry)) { err = setup_obb_dentry(dentry, &lower_path); - if(err) { + if (err) { /* if the sbi->obbpath is not 
available, the lower_path won't be * changed by setup_obb_dentry() but the lower path is saved to * its orig_path. this dentry will be revalidated later. - * but now, the lower_path should be NULL */ + * but now, the lower_path should be NULL + */ sdcardfs_put_reset_lower_path(dentry); /* the newly created lower path which saved to its orig_path or * the lower_path is the base obbpath. - * therefore, an additional path_get is required */ + * therefore, an additional path_get is required + */ path_get(&lower_path); } else make_nomedia_in_obb = 1; @@ -382,7 +385,7 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) struct path lower_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(dir, &dentry->d_name)) { + if (!check_caller_access_to_name(dir, &dentry->d_name)) { err = -EACCES; goto out_eacces; } @@ -391,7 +394,8 @@ static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry) OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry - * the dentry on the original path should be deleted. */ + * the dentry on the original path should be deleted. + */ sdcardfs_get_real_lower(dentry, &lower_path); lower_dentry = lower_path.dentry; @@ -467,7 +471,7 @@ static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct path lower_old_path, lower_new_path; const struct cred *saved_cred = NULL; - if(!check_caller_access_to_name(old_dir, &old_dentry->d_name) || + if (!check_caller_access_to_name(old_dir, &old_dentry->d_name) || !check_caller_access_to_name(new_dir, &new_dentry->d_name)) { err = -EACCES; goto out_eacces; @@ -656,6 +660,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma * we check it with AID_MEDIA_RW permission */ struct inode *lower_inode; + OVERRIDE_CRED(SDCARDFS_SB(inode->sb)); lower_inode = sdcardfs_lower_inode(inode); @@ -724,7 +729,8 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct /* prepare our own lower struct iattr (with the lower file) */ memcpy(&lower_ia, ia, sizeof(lower_ia)); /* Allow touch updating timestamps. A previous permission check ensures - * we have write access. Changes to mode, owner, and group are ignored*/ + * we have write access. 
Changes to mode, owner, and group are ignored + */ ia->ia_valid |= ATTR_FORCE; err = inode_change_ok(&tmp, ia); @@ -810,10 +816,12 @@ out_err: return err; } -static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode, struct kstat *stat) +static int sdcardfs_fillattr(struct vfsmount *mnt, + struct inode *inode, struct kstat *stat) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); struct inode *top = grab_top(info); + if (!top) return -EINVAL; @@ -843,7 +851,7 @@ static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry, int err; parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { + if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { dput(parent); return -EACCES; } diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index fffb94c923c4..fab9fd9d0105 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -73,6 +73,7 @@ static int sdcardfs_inode_test(struct inode *inode, void *candidate_data/*void * { struct inode *current_lower_inode = sdcardfs_lower_inode(inode); userid_t current_userid = SDCARDFS_I(inode)->userid; + if (current_lower_inode == ((struct inode_data *)candidate_data)->lower_inode && current_userid == ((struct inode_data *)candidate_data)->id) return 1; /* found a match */ @@ -102,7 +103,7 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, u * instead. */ lower_inode->i_ino, /* hashval */ - sdcardfs_inode_test, /* inode comparison function */ + sdcardfs_inode_test, /* inode comparison function */ sdcardfs_inode_set, /* inode init function */ &data); /* data passed to test+set fxns */ if (!inode) { @@ -213,8 +214,8 @@ struct sdcardfs_name_data { bool found; }; -static int sdcardfs_name_match(struct dir_context *ctx, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int sdcardfs_name_match(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct sdcardfs_name_data *buf = container_of(ctx, struct sdcardfs_name_data, ctx); struct qstr candidate = QSTR_INIT(name, namelen); @@ -303,23 +304,26 @@ put_name: if (!err) { /* check if the dentry is an obb dentry * if true, the lower_inode must be replaced with - * the inode of the graft path */ + * the inode of the graft path + */ - if(need_graft_path(dentry)) { + if (need_graft_path(dentry)) { /* setup_obb_dentry() - * The lower_path will be stored to the dentry's orig_path + * The lower_path will be stored to the dentry's orig_path * and the base obbpath will be copyed to the lower_path variable. * if an error returned, there's no change in the lower_path - * returns: -ERRNO if error (0: no error) */ + * returns: -ERRNO if error (0: no error) + */ err = setup_obb_dentry(dentry, &lower_path); - if(err) { + if (err) { /* if the sbi->obbpath is not available, we can optionally * setup the lower_path with its orig_path. * but, the current implementation just returns an error * because the sdcard daemon also regards this case as - * a lookup fail. */ + * a lookup fail. + */ printk(KERN_INFO "sdcardfs: base obbpath is not available\n"); sdcardfs_put_reset_orig_path(dentry); goto out; @@ -374,9 +378,9 @@ out: /* * On success: - * fills dentry object appropriate values and returns NULL. + * fills dentry object appropriate values and returns NULL. * On fail (== error) - * returns error ptr + * returns error ptr * * @dir : Parent inode. It is locked (dir->i_mutex) * @dentry : Target dentry to lookup. 
we should set each of fields. @@ -393,10 +397,10 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, parent = dget_parent(dentry); - if(!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { + if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) { ret = ERR_PTR(-EACCES); goto out_err; - } + } /* save current_cred and override it */ OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir)); @@ -412,9 +416,7 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path, SDCARDFS_I(dir)->userid); if (IS_ERR(ret)) - { goto out; - } if (ret) dentry = ret; if (d_inode(dentry)) { diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 4e2aded8d1d9..c249bd73b121 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -72,6 +72,7 @@ static int parse_options(struct super_block *sb, char *options, int silent, while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -148,6 +149,7 @@ int parse_options_remount(struct super_block *sb, char *options, int silent, while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -223,8 +225,8 @@ static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb) #endif DEFINE_MUTEX(sdcardfs_super_list_lock); -LIST_HEAD(sdcardfs_super_list); EXPORT_SYMBOL_GPL(sdcardfs_super_list_lock); +LIST_HEAD(sdcardfs_super_list); EXPORT_SYMBOL_GPL(sdcardfs_super_list); /* @@ -328,14 +330,15 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, /* setup permission policy */ sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL); mutex_lock(&sdcardfs_super_list_lock); - if(sb_info->options.multiuser) { - setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); + if (sb_info->options.multiuser) { + setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT, + sb_info->options.fs_user_id, AID_ROOT, + false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); - /*err = prepare_dir(sb_info->obbpath_s, - sb_info->options.fs_low_uid, - sb_info->options.fs_low_gid, 00755);*/ } else { - setup_derived_state(d_inode(sb->s_root), PERM_ROOT, sb_info->options.fs_user_id, AID_ROOT, false, d_inode(sb->s_root)); + setup_derived_state(d_inode(sb->s_root), PERM_ROOT, + sb_info->options.fs_user_id, AID_ROOT, + false, d_inode(sb->s_root)); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fixup_tmp_permissions(d_inode(sb->s_root)); @@ -368,8 +371,10 @@ out: /* A feature which supports mount_nodev() with options */ static struct dentry *mount_nodev_with_options(struct vfsmount *mnt, - struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct vfsmount *, struct super_block *, const char *, void *, int)) + struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + int (*fill_super)(struct vfsmount *, struct super_block *, + const char *, void *, int)) { int error; @@ -401,19 +406,22 @@ static struct dentry *sdcardfs_mount(struct vfsmount *mnt, raw_data, sdcardfs_read_super); } -static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) +static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) { WARN(1, "sdcardfs does not support mount. 
Use mount2.\n"); return ERR_PTR(-EINVAL); } -void *sdcardfs_alloc_mnt_data(void) { +void *sdcardfs_alloc_mnt_data(void) +{ return kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); } -void sdcardfs_kill_sb(struct super_block *sb) { +void sdcardfs_kill_sb(struct super_block *sb) +{ struct sdcardfs_sb_info *sbi; + if (sb->s_magic == SDCARDFS_SUPER_MAGIC) { sbi = SDCARDFS_SB(sb); mutex_lock(&sdcardfs_super_list_lock); diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 5398ebf8a34b..58218fed4083 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -61,7 +61,8 @@ static inline void qstr_init(struct qstr *q, const char *name) q->hash = full_name_case_hash(q->name, q->len); } -static inline int qstr_copy(const struct qstr *src, struct qstr *dest) { +static inline int qstr_copy(const struct qstr *src, struct qstr *dest) +{ dest->name = kstrdup(src->name, GFP_KERNEL); dest->hash_len = src->hash_len; return !!dest->name; @@ -89,6 +90,7 @@ static appid_t __get_appid(const struct qstr *key) appid_t get_appid(const char *key) { struct qstr q; + qstr_init(&q, key); return __get_appid(&q); } @@ -114,6 +116,7 @@ static appid_t __get_ext_gid(const struct qstr *key) appid_t get_ext_gid(const char *key) { struct qstr q; + qstr_init(&q, key); return __get_ext_gid(&q); } @@ -144,8 +147,10 @@ appid_t is_excluded(const char *key, userid_t user) /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access - * even further, such as enforcing that apps hold sdcard_rw. */ -int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name) { + * even further, such as enforcing that apps hold sdcard_rw. + */ +int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name) +{ struct qstr q_autorun = QSTR_LITERAL("autorun.inf"); struct qstr q__android_secure = QSTR_LITERAL(".android_secure"); struct qstr q_android_secure = QSTR_LITERAL("android_secure"); @@ -160,26 +165,26 @@ int check_caller_access_to_name(struct inode *parent_node, const struct qstr *na } /* Root always has access; access for any other UIDs should always - * be controlled through packages.list. */ - if (from_kuid(&init_user_ns, current_fsuid()) == 0) { + * be controlled through packages.list. + */ + if (from_kuid(&init_user_ns, current_fsuid()) == 0) return 1; - } /* No extra permissions to enforce */ return 1; } /* This function is used when file opening. 
The open flags must be - * checked before calling check_caller_access_to_name() */ -int open_flags_to_access_mode(int open_flags) { - if((open_flags & O_ACCMODE) == O_RDONLY) { + * checked before calling check_caller_access_to_name() + */ +int open_flags_to_access_mode(int open_flags) +{ + if ((open_flags & O_ACCMODE) == O_RDONLY) return 0; /* R_OK */ - } else if ((open_flags & O_ACCMODE) == O_WRONLY) { + if ((open_flags & O_ACCMODE) == O_WRONLY) return 1; /* W_OK */ - } else { - /* Probably O_RDRW, but treat as default to be safe */ + /* Probably O_RDRW, but treat as default to be safe */ return 1; /* R_OK | W_OK */ - } } static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, @@ -371,7 +376,6 @@ static void remove_packagelist_entry(const struct qstr *key) remove_packagelist_entry_locked(key); fixup_all_perms_name(key); mutex_unlock(&sdcardfs_super_list_lock); - return; } static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group) @@ -394,7 +398,6 @@ static void remove_ext_gid_entry(const struct qstr *key, gid_t group) mutex_lock(&sdcardfs_super_list_lock); remove_ext_gid_entry_locked(key, group); mutex_unlock(&sdcardfs_super_list_lock); - return; } static void remove_userid_all_entry_locked(userid_t userid) @@ -422,7 +425,6 @@ static void remove_userid_all_entry(userid_t userid) remove_userid_all_entry_locked(userid); fixup_all_perms_userid(userid); mutex_unlock(&sdcardfs_super_list_lock); - return; } static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t userid) @@ -447,7 +449,6 @@ static void remove_userid_exclude_entry(const struct qstr *key, userid_t userid) remove_userid_exclude_entry_locked(key, userid); fixup_all_perms_name_userid(key, userid); mutex_unlock(&sdcardfs_super_list_lock); - return; } static void packagelist_destroy(void) @@ -456,6 +457,7 @@ static void packagelist_destroy(void) struct hlist_node *h_t; HLIST_HEAD(free_list); int i; + mutex_lock(&sdcardfs_super_list_lock); hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) { hash_del_rcu(&hash_cur->hlist); @@ -603,7 +605,7 @@ static struct configfs_attribute *package_details_attrs[] = { }; static struct configfs_item_operations package_details_item_ops = { - .release = package_details_release, + .release = package_details_release, }; static struct config_item_type package_appid_type = { @@ -659,6 +661,7 @@ static struct config_item *extension_details_make_item(struct config_group *grou struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL); const char *tmp; int ret; + if (!extension_details) return ERR_PTR(-ENOMEM); @@ -713,6 +716,7 @@ static struct config_group *extensions_make_group(struct config_group *group, co static void extensions_drop_group(struct config_group *group, struct config_item *item) { struct extensions_value *value = to_extensions_value(item); + printk(KERN_INFO "sdcardfs: No longer mapping any files to gid %d\n", value->num); kfree(value); } @@ -847,6 +851,7 @@ static int configfs_sdcardfs_init(void) { int ret, i; struct configfs_subsystem *subsys = &sdcardfs_packages; + for (i = 0; sd_default_groups[i]; i++) { config_group_init(sd_default_groups[i]); } @@ -877,7 +882,7 @@ int packagelist_init(void) } configfs_sdcardfs_init(); - return 0; + return 0; } void packagelist_exit(void) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index e28a7dd47ebb..d1b86fff1b70 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -87,11 +87,11 @@ } while (0) /* OVERRIDE_CRED() and 
REVERT_CRED() - * OVERRID_CRED() - * backup original task->cred - * and modifies task->cred->fsuid/fsgid to specified value. + * OVERRIDE_CRED() + * backup original task->cred + * and modifies task->cred->fsuid/fsgid to specified value. * REVERT_CRED() - * restore original task->cred->fsuid/fsgid. + * restore original task->cred->fsuid/fsgid. * These two macro should be used in pair, and OVERRIDE_CRED() should be * placed at the beginning of a function, right after variable declaration. */ @@ -114,27 +114,29 @@ /* Android 5.0 support */ /* Permission mode for a specific node. Controls how file permissions - * are derived for children nodes. */ + * are derived for children nodes. + */ typedef enum { - /* Nothing special; this node should just inherit from its parent. */ - PERM_INHERIT, - /* This node is one level above a normal root; used for legacy layouts - * which use the first level to represent user_id. */ - PERM_PRE_ROOT, - /* This node is "/" */ - PERM_ROOT, - /* This node is "/Android" */ - PERM_ANDROID, - /* This node is "/Android/data" */ - PERM_ANDROID_DATA, - /* This node is "/Android/obb" */ - PERM_ANDROID_OBB, - /* This node is "/Android/media" */ - PERM_ANDROID_MEDIA, - /* This node is "/Android/[data|media|obb]/[package]" */ - PERM_ANDROID_PACKAGE, - /* This node is "/Android/[data|media|obb]/[package]/cache" */ - PERM_ANDROID_PACKAGE_CACHE, + /* Nothing special; this node should just inherit from its parent. */ + PERM_INHERIT, + /* This node is one level above a normal root; used for legacy layouts + * which use the first level to represent user_id. + */ + PERM_PRE_ROOT, + /* This node is "/" */ + PERM_ROOT, + /* This node is "/Android" */ + PERM_ANDROID, + /* This node is "/Android/data" */ + PERM_ANDROID_DATA, + /* This node is "/Android/obb" */ + PERM_ANDROID_OBB, + /* This node is "/Android/media" */ + PERM_ANDROID_MEDIA, + /* This node is "/Android/[data|media|obb]/[package]" */ + PERM_ANDROID_PACKAGE, + /* This node is "/Android/[data|media|obb]/[package]/cache" */ + PERM_ANDROID_PACKAGE_CACHE, } perm_t; struct sdcardfs_sb_info; @@ -142,9 +144,9 @@ struct sdcardfs_mount_options; struct sdcardfs_inode_info; /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ -const struct cred * override_fsids(struct sdcardfs_sb_info* sbi, struct sdcardfs_inode_info *info); +const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_info *info); /* Do not directly use this function, use REVERT_CRED() instead. 
*/ -void revert_fsids(const struct cred * old_cred); +void revert_fsids(const struct cred *old_cred); /* operations vectors defined in specific files */ extern const struct file_operations sdcardfs_main_fops; @@ -221,7 +223,8 @@ struct sdcardfs_sb_info { struct super_block *sb; struct super_block *lower_sb; /* derived perm policy : some of options have been added - * to sdcardfs_mount_options (Android 4.4 support) */ + * to sdcardfs_mount_options (Android 4.4 support) + */ struct sdcardfs_mount_options options; spinlock_t lock; /* protects obbpath */ char *obbpath_s; @@ -332,7 +335,7 @@ static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \ { \ struct path pname; \ spin_lock(&SDCARDFS_D(dent)->lock); \ - if(SDCARDFS_D(dent)->pname.dentry) { \ + if (SDCARDFS_D(dent)->pname.dentry) { \ pathcpy(&pname, &SDCARDFS_D(dent)->pname); \ SDCARDFS_D(dent)->pname.dentry = NULL; \ SDCARDFS_D(dent)->pname.mnt = NULL; \ @@ -348,17 +351,17 @@ SDCARDFS_DENT_FUNC(orig_path) static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo) { - return sbinfo && sbinfo->sb && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; + return sbinfo && sbinfo->sb && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; } /* grab a refererence if we aren't linking to ourself */ static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) { struct inode *old_top = NULL; + BUG_ON(IS_ERR_OR_NULL(top)); - if (info->top && info->top != &info->vfs_inode) { + if (info->top && info->top != &info->vfs_inode) old_top = info->top; - } if (top != &info->vfs_inode) igrab(top); info->top = top; @@ -368,11 +371,11 @@ static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) static inline struct inode *grab_top(struct sdcardfs_inode_info *info) { struct inode *top = info->top; - if (top) { + + if (top) return igrab(top); - } else { + else return NULL; - } } static inline void release_top(struct sdcardfs_inode_info *info) @@ -380,21 +383,24 @@ static inline void release_top(struct sdcardfs_inode_info *info) iput(info->top); } -static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info) { +static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info) +{ struct sdcardfs_vfsmount_options *opts = mnt->data; - if (opts->gid == AID_SDCARD_RW) { + if (opts->gid == AID_SDCARD_RW) /* As an optimization, certain trusted system components only run * as owner but operate across all users. Since we're now handing * out the sdcard_rw GID only to trusted apps, we're okay relaxing * the user boundary enforcement for the default view. The UIDs - * assigned to app directories are still multiuser aware. */ + * assigned to app directories are still multiuser aware. + */ return AID_SDCARD_RW; - } else { + else return multiuser_get_uid(info->userid, opts->gid); - } } -static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *info) { + +static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *info) +{ int owner_mode; int filtered_mode; struct sdcardfs_vfsmount_options *opts = mnt->data; @@ -403,17 +409,18 @@ static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *inf if (info->perm == PERM_PRE_ROOT) { /* Top of multi-user view should always be visible to ensure - * secondary users can traverse inside. */ + * secondary users can traverse inside. 
+ */ visible_mode = 0711; } else if (info->under_android) { /* Block "other" access to Android directories, since only apps * belonging to a specific user should be in there; we still - * leave +x open for the default view. */ - if (opts->gid == AID_SDCARD_RW) { + * leave +x open for the default view. + */ + if (opts->gid == AID_SDCARD_RW) visible_mode = visible_mode & ~0006; - } else { + else visible_mode = visible_mode & ~0007; - } } owner_mode = info->lower_inode->i_mode & 0700; filtered_mode = visible_mode & (owner_mode | (owner_mode >> 3) | (owner_mode >> 6)); @@ -438,7 +445,7 @@ static inline void sdcardfs_get_real_lower(const struct dentry *dent, /* in case of a local obb dentry * the orig_path should be returned */ - if(has_graft_path(dent)) + if (has_graft_path(dent)) sdcardfs_get_orig_path(dent, real_lower); else sdcardfs_get_lower_path(dent, real_lower); @@ -447,7 +454,7 @@ static inline void sdcardfs_get_real_lower(const struct dentry *dent, static inline void sdcardfs_put_real_lower(const struct dentry *dent, struct path *real_lower) { - if(has_graft_path(dent)) + if (has_graft_path(dent)) sdcardfs_put_orig_path(dent, real_lower); else sdcardfs_put_lower_path(dent, real_lower); @@ -460,7 +467,7 @@ extern struct list_head sdcardfs_super_list; extern appid_t get_appid(const char *app_name); extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); -extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr* name); +extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name); extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void); @@ -481,7 +488,7 @@ extern void get_derived_permission_new(struct dentry *parent, struct dentry *den extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); extern void update_derived_permission_lock(struct dentry *dentry); -void fixup_lower_ownership(struct dentry* dentry, const char *name); +void fixup_lower_ownership(struct dentry *dentry, const char *name); extern int need_graft_path(struct dentry *dentry); extern int is_base_obbpath(struct dentry *dentry); extern int is_obbpath_invalid(struct dentry *dentry); @@ -491,6 +498,7 @@ extern int setup_obb_dentry(struct dentry *dentry, struct path *lower_path); static inline struct dentry *lock_parent(struct dentry *dentry) { struct dentry *dir = dget_parent(dentry); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); return dir; } diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index edda32b68dc0..a4629f20812e 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -36,7 +36,7 @@ static void sdcardfs_put_super(struct super_block *sb) if (!spd) return; - if(spd->obbpath_s) { + if (spd->obbpath_s) { kfree(spd->obbpath_s); path_put(&spd->obbpath); } @@ -125,29 +125,33 @@ static int sdcardfs_remount_fs2(struct vfsmount *mnt, struct super_block *sb, * SILENT, but anything else left over is an error. 
*/ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT | MS_REMOUNT)) != 0) { - printk(KERN_ERR - "sdcardfs: remount flags 0x%x unsupported\n", *flags); + pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags); err = -EINVAL; } - printk(KERN_INFO "Remount options were %s for vfsmnt %p.\n", options, mnt); + pr_info("Remount options were %s for vfsmnt %p.\n", options, mnt); err = parse_options_remount(sb, options, *flags & ~MS_SILENT, mnt->data); return err; } -static void* sdcardfs_clone_mnt_data(void *data) { - struct sdcardfs_vfsmount_options* opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); - struct sdcardfs_vfsmount_options* old = data; - if(!opt) return NULL; +static void *sdcardfs_clone_mnt_data(void *data) +{ + struct sdcardfs_vfsmount_options *opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL); + struct sdcardfs_vfsmount_options *old = data; + + if (!opt) + return NULL; opt->gid = old->gid; opt->mask = old->mask; return opt; } -static void sdcardfs_copy_mnt_data(void *data, void *newdata) { - struct sdcardfs_vfsmount_options* old = data; - struct sdcardfs_vfsmount_options* new = newdata; +static void sdcardfs_copy_mnt_data(void *data, void *newdata) +{ + struct sdcardfs_vfsmount_options *old = data; + struct sdcardfs_vfsmount_options *new = newdata; + old->gid = new->gid; old->mask = new->mask; } @@ -235,7 +239,8 @@ static void sdcardfs_umount_begin(struct super_block *sb) lower_sb->s_op->umount_begin(lower_sb); } -static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, struct dentry *root) +static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, + struct dentry *root) { struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb); struct sdcardfs_mount_options *opts = &sbi->options; From 9eb0b8ba1c9248d3e76f170265be0162026ea5c2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Mar 2017 19:33:35 -0700 Subject: [PATCH 0718/3220] ANDROID: sdcardfs: Fix style issues with comments Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I8791ef7eac527645ecb9407908e7e5ece35b8f80 --- fs/sdcardfs/dentry.c | 16 +--------------- fs/sdcardfs/derived_perm.c | 9 +++------ fs/sdcardfs/main.c | 2 +- 3 files changed, 5 insertions(+), 22 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index aaa15dff6f51..8e31d1a80f0c 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -127,12 +127,10 @@ static int sdcardfs_hash_ci(const struct dentry *dentry, unsigned long hash; name = qstr->name; - //len = vfat_striptail_len(qstr); len = qstr->len; hash = init_name_hash(); while (len--) - //hash = partial_name_hash(nls_tolower(t, *name++), hash); hash = partial_name_hash(tolower(*name++), hash); qstr->hash = end_name_hash(hash); @@ -146,20 +144,8 @@ static int sdcardfs_cmp_ci(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - /* This function is copy of vfat_cmpi */ - // FIXME Should we support national language? - //struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; - //unsigned int alen, blen; + /* FIXME Should we support national language? */ - /* A filename cannot end in '.' 
or we treat it like it has none */ - /* - alen = vfat_striptail_len(name); - blen = __vfat_striptail_len(len, str); - if (alen == blen) { - if (nls_strnicmp(t, name->name, str, alen) == 0) - return 0; - } - */ if (name->len == len) { if (str_n_case_eq(name->name, str, len)) return 0; diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 51fa565742bd..748eb3741b3b 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -325,9 +325,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) * 1. need to check whether the dentry is updated or not * 2. remove the root dentry update */ - if (IS_ROOT(dentry)) { - //setup_default_pre_root_state(d_inode(dentry)); - } else { + if (!IS_ROOT(dentry)) { parent = dget_parent(dentry); if (parent) { get_derived_permission(parent, dentry); @@ -377,7 +375,6 @@ int is_obbpath_invalid(struct dentry *dent) ret = 1; } else { path_get(&di->lower_path); - //lower_parent = lock_parent(lower_path->dentry); path_buf = kmalloc(PATH_MAX, GFP_ATOMIC); if (!path_buf) { @@ -392,7 +389,6 @@ int is_obbpath_invalid(struct dentry *dent) kfree(path_buf); } - //unlock_dir(lower_parent); pathcpy(&lower_path, &di->lower_path); need_put = 1; } @@ -438,7 +434,8 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) /* A local obb dentry must have its own orig_path to support rmdir * and mkdir of itself. Usually, we expect that the sbi->obbpath - * is avaiable on this stage. */ + * is avaiable on this stage. + */ sdcardfs_set_orig_path(dentry, lower_path); err = kern_path(sbi->obbpath_s, diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index c249bd73b121..95cf03379019 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -29,7 +29,7 @@ enum { Opt_gid, Opt_debug, Opt_mask, - Opt_multiuser, // May need? + Opt_multiuser, Opt_userid, Opt_reserved_mb, Opt_err, From 1ed9910bedd662fe74d213b6e4333d1f08dfdb90 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 21 Mar 2017 16:29:13 -0700 Subject: [PATCH 0719/3220] ANDROID: sdcardfs: remove unneeded null check As pointed out by checkpatch, these functions already handle null inputs, so the checks are not needed. 
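(Editor's aside, illustrative only and not part of this commit: the contract relied on here is that kmem_cache_destroy(), like kfree(), is a no-op when passed NULL, so paired init/exit code can call it unconditionally. A minimal sketch, with a hypothetical cache and object name:)

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

struct example_obj { int val; };

static struct kmem_cache *example_cachep;	/* hypothetical cache */

static int __init example_init(void)
{
	example_cachep = kmem_cache_create("example_obj",
					   sizeof(struct example_obj),
					   0, 0, NULL);
	return example_cachep ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	/* No NULL guard needed: kmem_cache_destroy() handles it. */
	kmem_cache_destroy(example_cachep);
}

With that contract, an "if (cachep)" guard around the destroy call is pure noise, which is exactly what checkpatch flags and what the hunks below delete.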
Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I189342f032dfcefee36b27648bb512488ad61d20 --- fs/sdcardfs/lookup.c | 3 +-- fs/sdcardfs/packagelist.c | 3 +-- fs/sdcardfs/super.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index fab9fd9d0105..51a904e59f46 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -36,8 +36,7 @@ int sdcardfs_init_dentry_cache(void) void sdcardfs_destroy_dentry_cache(void) { - if (sdcardfs_dentry_cachep) - kmem_cache_destroy(sdcardfs_dentry_cachep); + kmem_cache_destroy(sdcardfs_dentry_cachep); } void free_dentry_private_data(struct dentry *dentry) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 58218fed4083..3c016cb0d8bb 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -889,6 +889,5 @@ void packagelist_exit(void) { configfs_sdcardfs_exit(); packagelist_destroy(); - if (hashtable_entry_cachep) - kmem_cache_destroy(hashtable_entry_cachep); + kmem_cache_destroy(hashtable_entry_cachep); } diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index a4629f20812e..7d3c331379f5 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -222,8 +222,7 @@ int sdcardfs_init_inode_cache(void) /* sdcardfs inode cache destructor */ void sdcardfs_destroy_inode_cache(void) { - if (sdcardfs_inode_cachep) - kmem_cache_destroy(sdcardfs_inode_cachep); + kmem_cache_destroy(sdcardfs_inode_cachep); } /* From b7dbda19b8cebde338a9034d4706519c93c25a39 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Mar 2017 17:46:13 -0700 Subject: [PATCH 0720/3220] ANDROID: sdcardfs: Use pr_[...] instead of printk Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: Ibc635ec865750530d32b87067779f681fe58a003 --- fs/sdcardfs/derived_perm.c | 6 ++--- fs/sdcardfs/file.c | 9 ++++---- fs/sdcardfs/inode.c | 8 +++---- fs/sdcardfs/lookup.c | 2 +- fs/sdcardfs/main.c | 45 +++++++++++++++++--------------------- fs/sdcardfs/packagelist.c | 16 +++++++------- fs/sdcardfs/sdcardfs.h | 2 +- fs/sdcardfs/super.c | 5 ++--- 8 files changed, 43 insertions(+), 50 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 748eb3741b3b..eabdbfd30054 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -318,7 +318,7 @@ inline void update_derived_permission_lock(struct dentry *dentry) struct dentry *parent; if (!dentry || !d_inode(dentry)) { - printk(KERN_ERR "sdcardfs: %s: invalid dentry\n", __func__); + pr_err("sdcardfs: %s: invalid dentry\n", __func__); return; } /* FIXME: @@ -379,7 +379,7 @@ int is_obbpath_invalid(struct dentry *dent) path_buf = kmalloc(PATH_MAX, GFP_ATOMIC); if (!path_buf) { ret = 1; - printk(KERN_ERR "sdcardfs: fail to allocate path_buf in %s.\n", __func__); + pr_err("sdcardfs: fail to allocate path_buf in %s.\n", __func__); } else { obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX); if (d_unhashed(di->lower_path.dentry) || @@ -451,7 +451,7 @@ int setup_obb_dentry(struct dentry *dentry, struct path *lower_path) * because the sdcard daemon also regards this case as * a lookup fail. 
*/ - printk(KERN_INFO "sdcardfs: the sbi->obbpath is not available\n"); + pr_info("sdcardfs: the sbi->obbpath is not available\n"); } return err; } diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 4c9726606c0f..eee4eb5896e5 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -65,7 +65,7 @@ static ssize_t sdcardfs_write(struct file *file, const char __user *buf, /* check disk space */ if (!check_min_free_space(dentry, count, 0)) { - printk(KERN_INFO "No minimum free space.\n"); + pr_err("No minimum free space.\n"); return -ENOSPC; } @@ -160,8 +160,7 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) lower_file = sdcardfs_lower_file(file); if (willwrite && !lower_file->f_mapping->a_ops->writepage) { err = -EINVAL; - printk(KERN_ERR "sdcardfs: lower file system does not " - "support writeable mmap\n"); + pr_err("sdcardfs: lower file system does not support writeable mmap\n"); goto out; } @@ -173,14 +172,14 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) if (!SDCARDFS_F(file)->lower_vm_ops) { err = lower_file->f_op->mmap(lower_file, vma); if (err) { - printk(KERN_ERR "sdcardfs: lower mmap failed %d\n", err); + pr_err("sdcardfs: lower mmap failed %d\n", err); goto out; } saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */ err = do_munmap(current->mm, vma->vm_start, vma->vm_end - vma->vm_start); if (err) { - printk(KERN_ERR "sdcardfs: do_munmap failed %d\n", err); + pr_err("sdcardfs: do_munmap failed %d\n", err); goto out; } } diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index ce187d865f29..f15cb11ca8fd 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -248,7 +248,7 @@ static int touch(char *abs_path, mode_t mode) if (PTR_ERR(filp) == -EEXIST) { return 0; } else { - printk(KERN_ERR "sdcardfs: failed to open(%s): %ld\n", + pr_err("sdcardfs: failed to open(%s): %ld\n", abs_path, PTR_ERR(filp)); return PTR_ERR(filp); } @@ -284,7 +284,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* check disk space */ if (!check_min_free_space(dentry, 0, 1)) { - printk(KERN_INFO "sdcardfs: No minimum free space.\n"); + pr_err("sdcardfs: No minimum free space.\n"); err = -ENOSPC; goto out_revert; } @@ -360,7 +360,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode set_fs_pwd(current->fs, &lower_path); touch_err = touch(".nomedia", 0664); if (touch_err) { - printk(KERN_ERR "sdcardfs: failed to create .nomedia in %s: %d\n", + pr_err("sdcardfs: failed to create .nomedia in %s: %d\n", lower_path.dentry->d_name.name, touch_err); goto out; } @@ -642,7 +642,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma release_top(SDCARDFS_I(inode)); tmp.i_sb = inode->i_sb; if (IS_POSIXACL(inode)) - printk(KERN_WARNING "%s: This may be undefined behavior... \n", __func__); + pr_warn("%s: This may be undefined behavior...\n", __func__); err = generic_permission(&tmp, mask); /* XXX * Original sdcardfs code calls inode_permission(lower_inode,.. ) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 51a904e59f46..cfa2b12b6411 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -323,7 +323,7 @@ put_name: * because the sdcard daemon also regards this case as * a lookup fail. 
*/ - printk(KERN_INFO "sdcardfs: base obbpath is not available\n"); + pr_info("sdcardfs: base obbpath is not available\n"); sdcardfs_put_reset_orig_path(dentry); goto out; } diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 95cf03379019..7344635522cd 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -117,19 +117,17 @@ static int parse_options(struct super_block *sb, char *options, int silent, break; /* unknown option */ default: - if (!silent) { - printk( KERN_ERR "Unrecognized mount option \"%s\" " - "or missing value", p); - } + if (!silent) + pr_err("Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } if (*debug) { - printk( KERN_INFO "sdcardfs : options - debug:%d\n", *debug); - printk( KERN_INFO "sdcardfs : options - uid:%d\n", + pr_info("sdcardfs : options - debug:%d\n", *debug); + pr_info("sdcardfs : options - uid:%d\n", opts->fs_low_uid); - printk( KERN_INFO "sdcardfs : options - gid:%d\n", + pr_info("sdcardfs : options - gid:%d\n", opts->fs_low_gid); } @@ -175,22 +173,20 @@ int parse_options_remount(struct super_block *sb, char *options, int silent, case Opt_fsuid: case Opt_fsgid: case Opt_reserved_mb: - printk( KERN_WARNING "Option \"%s\" can't be changed during remount\n", p); + pr_warn("Option \"%s\" can't be changed during remount\n", p); break; /* unknown option */ default: - if (!silent) { - printk( KERN_ERR "Unrecognized mount option \"%s\" " - "or missing value", p); - } + if (!silent) + pr_err("Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } if (debug) { - printk( KERN_INFO "sdcardfs : options - debug:%d\n", debug); - printk( KERN_INFO "sdcardfs : options - gid:%d\n", vfsopts->gid); - printk( KERN_INFO "sdcardfs : options - mask:%d\n", vfsopts->mask); + pr_info("sdcardfs : options - debug:%d\n", debug); + pr_info("sdcardfs : options - gid:%d\n", vfsopts->gid); + pr_info("sdcardfs : options - mask:%d\n", vfsopts->mask); } return 0; @@ -244,31 +240,30 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, struct sdcardfs_vfsmount_options *mnt_opt = mnt->data; struct inode *inode; - printk(KERN_INFO "sdcardfs version 2.0\n"); + pr_info("sdcardfs version 2.0\n"); if (!dev_name) { - printk(KERN_ERR - "sdcardfs: read_super: missing dev_name argument\n"); + pr_err("sdcardfs: read_super: missing dev_name argument\n"); err = -EINVAL; goto out; } - printk(KERN_INFO "sdcardfs: dev_name -> %s\n", dev_name); - printk(KERN_INFO "sdcardfs: options -> %s\n", (char *)raw_data); - printk(KERN_INFO "sdcardfs: mnt -> %p\n", mnt); + pr_info("sdcardfs: dev_name -> %s\n", dev_name); + pr_info("sdcardfs: options -> %s\n", (char *)raw_data); + pr_info("sdcardfs: mnt -> %p\n", mnt); /* parse lower path */ err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &lower_path); if (err) { - printk(KERN_ERR "sdcardfs: error accessing lower directory '%s'\n", dev_name); + pr_err("sdcardfs: error accessing lower directory '%s'\n", dev_name); goto out; } /* allocate superblock private data */ sb->s_fs_info = kzalloc(sizeof(struct sdcardfs_sb_info), GFP_KERNEL); if (!SDCARDFS_SB(sb)) { - printk(KERN_CRIT "sdcardfs: read_super: out of memory\n"); + pr_crit("sdcardfs: read_super: out of memory\n"); err = -ENOMEM; goto out_free; } @@ -277,7 +272,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, /* parse options */ err = parse_options(sb, raw_data, silent, &debug, mnt_opt, &sb_info->options); if (err) { - printk(KERN_ERR "sdcardfs: invalid options\n"); + pr_err("sdcardfs: invalid 
options\n"); goto out_freesbi; } @@ -347,7 +342,7 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, mutex_unlock(&sdcardfs_super_list_lock); if (!silent) - printk(KERN_INFO "sdcardfs: mounted on top of %s type %s\n", + pr_info("sdcardfs: mounted on top of %s type %s\n", dev_name, lower_sb->s_type->name); goto out; /* all is well */ diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 3c016cb0d8bb..89196e31073e 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -471,7 +471,7 @@ static void packagelist_destroy(void) hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) free_hashtable_entry(hash_cur); mutex_unlock(&sdcardfs_super_list_lock); - printk(KERN_INFO "sdcardfs: destroyed packagelist pkgld\n"); + pr_info("sdcardfs: destroyed packagelist pkgld\n"); } #define SDCARDFS_CONFIGFS_ATTR(_pfx, _name) \ @@ -587,7 +587,8 @@ static ssize_t package_details_clear_userid_store(struct config_item *item, static void package_details_release(struct config_item *item) { struct package_details *package_details = to_package_details(item); - printk(KERN_INFO "sdcardfs: removing %s\n", package_details->name.name); + + pr_info("sdcardfs: removing %s\n", package_details->name.name); remove_packagelist_entry(&package_details->name); kfree(package_details->name.name); kfree(package_details); @@ -639,7 +640,7 @@ static void extension_details_release(struct config_item *item) { struct extension_details *extension_details = to_extension_details(item); - printk(KERN_INFO "sdcardfs: No longer mapping %s files to gid %d\n", + pr_info("sdcardfs: No longer mapping %s files to gid %d\n", extension_details->name.name, extension_details->num); remove_ext_gid_entry(&extension_details->name, extension_details->num); kfree(extension_details->name.name); @@ -717,7 +718,7 @@ static void extensions_drop_group(struct config_group *group, struct config_item { struct extensions_value *value = to_extensions_value(item); - printk(KERN_INFO "sdcardfs: No longer mapping any files to gid %d\n", value->num); + pr_info("sdcardfs: No longer mapping any files to gid %d\n", value->num); kfree(value); } @@ -852,14 +853,13 @@ static int configfs_sdcardfs_init(void) int ret, i; struct configfs_subsystem *subsys = &sdcardfs_packages; - for (i = 0; sd_default_groups[i]; i++) { + for (i = 0; sd_default_groups[i]; i++) config_group_init(sd_default_groups[i]); - } config_group_init(&subsys->su_group); mutex_init(&subsys->su_mutex); ret = configfs_register_subsystem(subsys); if (ret) { - printk(KERN_ERR "Error %d while registering subsystem %s\n", + pr_err("Error %d while registering subsystem %s\n", ret, subsys->su_group.cg_item.ci_namebuf); } @@ -877,7 +877,7 @@ int packagelist_init(void) kmem_cache_create("packagelist_hashtable_entry", sizeof(struct hashtable_entry), 0, 0, NULL); if (!hashtable_entry_cachep) { - printk(KERN_ERR "sdcardfs: failed creating pkgl_hashtable entry slab cache\n"); + pr_err("sdcardfs: failed creating pkgl_hashtable entry slab cache\n"); return -ENOMEM; } diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index d1b86fff1b70..a9c5a8345d09 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -53,7 +53,7 @@ #define SDCARDFS_ROOT_INO 1 /* useful for tracking code reachability */ -#define UDBG printk(KERN_DEFAULT "DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__) +#define UDBG pr_default("DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__) #define SDCARDFS_DIRENT_SIZE 256 diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 
7d3c331379f5..67b4ae24337a 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -64,7 +64,7 @@ static int sdcardfs_statfs(struct dentry *dentry, struct kstatfs *buf) if (sbi->options.reserved_mb) { /* Invalid statfs informations. */ if (buf->f_bsize == 0) { - printk(KERN_ERR "Returned block size is zero.\n"); + pr_err("Returned block size is zero.\n"); return -EINVAL; } @@ -100,8 +100,7 @@ static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options * SILENT, but anything else left over is an error. */ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) != 0) { - printk(KERN_ERR - "sdcardfs: remount flags 0x%x unsupported\n", *flags); + pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags); err = -EINVAL; } From a9a3f48222dbc841998989c8b363e86d21b1379b Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 16 Mar 2017 19:32:59 -0700 Subject: [PATCH 0721/3220] ANDROID: sdcardfs: Use kstrtoul Switch from the deprecated simple_strtoul to kstrtoul Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: If18bd133b4d2877f71e58b58fc31371ff6613ed5 --- fs/sdcardfs/derived_perm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index eabdbfd30054..f82fedd29007 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -60,6 +60,8 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); appid_t appid; + unsigned long user_num; + int err; struct qstr q_Android = QSTR_LITERAL("Android"); struct qstr q_data = QSTR_LITERAL("data"); struct qstr q_obb = QSTR_LITERAL("obb"); @@ -88,7 +90,11 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ info->perm = PERM_ROOT; - info->userid = simple_strtoul(name->name, NULL, 10); + err = kstrtoul(name->name, 10, &user_num); + if (err) + info->userid = 0; + else + info->userid = user_num; set_top(info, &info->vfs_inode); break; case PERM_ROOT: From 2b37dac9abaa183633ee933517c8c6e9d8cc3477 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 21 Mar 2017 17:27:40 -0700 Subject: [PATCH 0722/3220] ANDROID: sdcardfs: Use seq_puts over seq_printf Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I3795ec61ce61e324738815b1ce3b0e09b25d723f --- fs/sdcardfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 67b4ae24337a..a3393e959c63 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -251,7 +251,7 @@ static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, if (vfsopts->gid != 0) seq_printf(m, ",gid=%u", vfsopts->gid); if (opts->multiuser) - seq_printf(m, ",multiuser"); + seq_puts(m, ",multiuser"); if (vfsopts->mask) seq_printf(m, ",mask=%u", vfsopts->mask); if (opts->fs_user_id) From ff9fa56a43ea617f6d0e028a41cc79760bd4dcff Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 21 Mar 2017 19:11:38 -0700 Subject: [PATCH 0723/3220] ANDROID: sdcardfs: Fix style issues in macros Signed-off-by: Daniel Rosenberg Bug: 35331000 Change-Id: I89c4035029dc2236081a7685c55cac595d9e7ebf --- fs/sdcardfs/sdcardfs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index a9c5a8345d09..2b67b9a8ef9f 100644 ---
a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -96,21 +96,21 @@ * placed at the beginning of a function, right after variable declaration. */ #define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info) \ + do { \ saved_cred = override_fsids(sdcardfs_sbi, info); \ - if (!saved_cred) { return -ENOMEM; } + if (!saved_cred) \ + return -ENOMEM; \ + } while (0) #define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info) \ + do { \ saved_cred = override_fsids(sdcardfs_sbi, info); \ - if (!saved_cred) { return ERR_PTR(-ENOMEM); } + if (!saved_cred) \ + return ERR_PTR(-ENOMEM); \ + } while (0) #define REVERT_CRED(saved_cred) revert_fsids(saved_cred) -#define DEBUG_CRED() \ - printk("KAKJAGI: %s:%d fsuid %d fsgid %d\n", \ - __FUNCTION__, __LINE__, \ - (int)current->cred->fsuid, \ - (int)current->cred->fsgid); - /* Android 5.0 support */ /* Permission mode for a specific node. Controls how file permissions From e1feee06592832d3b9f51e0649b18d34f0a5e16e Mon Sep 17 00:00:00 2001 From: Prasanna Karthik Date: Tue, 8 Dec 2015 17:30:25 +0100 Subject: [PATCH 0724/3220] UPSTREAM: ARM: 8476/1: VDSO: use PTR_ERR_OR_ZERO for vma check (cherry pick from commit 38fc2f6c98262913388de338d5b0cda67e3f78cd) Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Signed-off-by: Prasanna Karthik Signed-off-by: Nathan Lynch Signed-off-by: Russell King Bug: 20045882 Bug: 19198045 Change-Id: Id4838948c19f031a66af1654a4c47668288a0037 --- arch/arm/kernel/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 54a5aeab988d..994e971a8538 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -224,7 +224,7 @@ static int install_vvar(struct mm_struct *mm, unsigned long addr) VM_READ | VM_MAYREAD, &vdso_data_mapping); - return IS_ERR(vma) ? 
PTR_ERR(vma) : 0; + return PTR_ERR_OR_ZERO(vma); } /* assumes mmap_sem is write-locked */ From 510b819279cb6d4b803059ef66ec1bc05e187a04 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 14 Jan 2016 15:18:33 -0800 Subject: [PATCH 0725/3220] UPSTREAM: mm: add PHYS_PFN, use it in __phys_to_pfn() (cherry pick from commit 8f235d1a3eb7198affe7cadf676a10afb8a46a1a) __phys_to_pfn and __pfn_to_phys are symmetric, PHYS_PFN and PFN_PHYS are symmetric: - y = (phys_addr_t)x << PAGE_SHIFT - y >> PAGE_SHIFT = (phys_addr_t)x - (unsigned long)(y >> PAGE_SHIFT) = x [akpm@linux-foundation.org: use macro arg name `x'] [arnd@arndb.de: include linux/pfn.h for PHYS_PFN definition] Signed-off-by: Chen Gang Cc: Oleg Nesterov Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 20045882 Bug: 19198045 Change-Id: If968d2246b381b9e5d6446e9d6d9fa45bb718e91 --- include/asm-generic/memory_model.h | 4 +++- include/linux/pfn.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 4b4b056a6eb0..5148150cc80b 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -1,6 +1,8 @@ #ifndef __ASM_MEMORY_MODEL_H #define __ASM_MEMORY_MODEL_H +#include <linux/pfn.h> + #ifndef __ASSEMBLY__ #if defined(CONFIG_FLATMEM) @@ -72,7 +74,7 @@ /* * Convert a physical address to a Page Frame Number and back */ -#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) +#define __phys_to_pfn(paddr) PHYS_PFN(paddr) #define __pfn_to_phys(pfn) PFN_PHYS(pfn) #define page_to_pfn __page_to_pfn diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 7646637221f3..97f3e88aead4 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -9,5 +9,6 @@ #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) #define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT) +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) #endif From f186947ce6fb40a19ac180d819b7c5e733a3a229 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 13 Mar 2016 09:13:55 +0900 Subject: [PATCH 0726/3220] UPSTREAM: kbuild: drop FORCE from PHONY targets (cherry pick from commit 2e8d696b79e9c68d3005a9b09a8c72625d141ea6) These targets are marked as PHONY. No need to add FORCE to their dependency. Signed-off-by: Masahiro Yamada Signed-off-by: Michal Marek Bug: 20045882 Bug: 19198045 Change-Id: I718d46c339c99418d543cc64cabc851307e5abb7 --- Makefile | 8 ++++---- arch/arm/vdso/Makefile | 2 +- arch/ia64/Makefile | 4 ++-- arch/x86/entry/vdso/Makefile | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 3efe2ea99e2d..6450e18ebd7a 100644 --- a/Makefile +++ b/Makefile @@ -146,7 +146,7 @@ PHONY += $(MAKECMDGOALS) sub-make $(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make @: -sub-make: FORCE +sub-make: $(Q)$(MAKE) -C $(KBUILD_OUTPUT) KBUILD_SRC=$(CURDIR) \ -f $(CURDIR)/Makefile $(filter-out _all sub-make,$(MAKECMDGOALS)) @@ -1000,7 +1000,7 @@ prepare1: prepare2 $(version_h) include/generated/utsrelease.h \ archprepare: archheaders archscripts prepare1 scripts_basic -prepare0: archprepare FORCE +prepare0: archprepare $(Q)$(MAKE) $(build)=. # All the preparing..
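# Editor's aside, illustrative only and not part of this patch: a target
# accumulated into PHONY is always considered out of date, so make runs
# its recipe on every invocation; listing the empty FORCE target as a
# prerequisite on top of that is redundant, which is all this cleanup
# removes. A minimal sketch with hypothetical target names:
#
#   PHONY += install
#   install:                  # no FORCE needed; PHONY already forces it
#   	$(Q)cp prog $(DESTDIR)/bin/prog
#
# FORCE remains the right tool for real-file targets whose recipes must
# be re-evaluated on every run, e.g. those guarded by if_changed:
#
#   prog: FORCE
#   	$(call if_changed,link)
#   FORCE: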
@@ -1045,7 +1045,7 @@ INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware export INSTALL_FW_PATH PHONY += firmware_install -firmware_install: FORCE +firmware_install: @mkdir -p $(objtree)/firmware $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install @@ -1065,7 +1065,7 @@ PHONY += archscripts archscripts: PHONY += __headers -__headers: $(version_h) scripts_basic asm-generic archheaders archscripts FORCE +__headers: $(version_h) scripts_basic asm-generic archheaders archscripts $(Q)$(MAKE) $(build)=scripts build_unifdef PHONY += headers_install_all diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 1160434eece0..59a8fa7b8a3b 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -74,5 +74,5 @@ $(MODLIB)/vdso: FORCE @mkdir -p $(MODLIB)/vdso PHONY += vdso_install -vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso FORCE +vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso $(call cmd,vdso_install) diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 970d0bd99621..648f1cef33fa 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -95,8 +95,8 @@ define archhelp echo '* unwcheck - Check vmlinux for invalid unwind info' endef -archprepare: make_nr_irqs_h FORCE +archprepare: make_nr_irqs_h PHONY += make_nr_irqs_h FORCE -make_nr_irqs_h: FORCE +make_nr_irqs_h: $(Q)$(MAKE) $(build)=arch/ia64/kernel include/generated/nr-irqs.h diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 265c0ed68118..7af017a8958f 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -187,10 +187,10 @@ vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) $(MODLIB)/vdso: FORCE @mkdir -p $(MODLIB)/vdso -$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE +$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso $(call cmd,vdso_install) PHONY += vdso_install $(vdso_img_insttargets) -vdso_install: $(vdso_img_insttargets) FORCE +vdso_install: $(vdso_img_insttargets) clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* From fb47dee8008c8424ab54543bb92f95505faca66d Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 12 May 2016 17:39:15 +0100 Subject: [PATCH 0727/3220] UPSTREAM: arm64: fix vdso-offsets.h dependency (cherry pick from commit a66649dab35033bd67988670fa60c268b0444cda) arm64/kernel/{vdso,signal}.c include vdso-offsets.h, as well as any file that includes asm/vdso.h. Therefore, vdso-offsets.h must be generated before these files are compiled. The current rules in arm64/kernel/Makefile do not actually enforce this, because even though $(obj)/vdso is listed as a prerequisite for vdso-offsets.h, this does not result in the intended effect of building the vdso subdirectory (before all the other objects). As a consequence, depending on the order in which the rules are followed, vdso-offsets.h is updated or not before arm64/kernel/{vdso,signal}.o are built. The current rules also impose an unnecessary dependency on vdso-offsets.h for all arm64/kernel/*.o, resulting in unnecessary rebuilds. This is made obvious when using make -j: touch arch/arm64/kernel/vdso/gettimeofday.S && make -j$NCPUS arch/arm64/kernel will sometimes result in none of arm64/kernel/*.o being rebuilt, sometimes all of them, or even just some of them. It is quite difficult to ensure that a header is generated before it is used with recursive Makefiles by using normal rules. 
Instead, arch-specific generated headers are normally built in the archprepare recipe in the arch Makefile (see for instance arch/ia64/Makefile). Unfortunately, asm-offsets.h is included in gettimeofday.S, and must therefore be generated before vdso-offsets.h, which is not the case if archprepare is used. For this reason, a rule run after archprepare has to be used. This commit adds rules in arm64/Makefile to build vdso-offsets.h during the prepare step, ensuring that vdso-offsets.h is generated before building anything. It also removes the now-unnecessary dependencies on vdso-offsets.h in arm64/kernel/Makefile. Finally, it removes the duplication of asm-offsets.h between arm64/kernel/vdso/ and include/generated/ and makes include/generated/vdso-offsets.h a target in arm64/kernel/vdso/Makefile. Cc: Will Deacon Cc: Michal Marek Signed-off-by: Kevin Brodsky Signed-off-by: Catalin Marinas Bug: 20045882 Bug: 19198045 Change-Id: Iaaf1c6d3c98287617fc7f63a736752f6597bc8d0 --- arch/arm64/Makefile | 10 ++++++++++ arch/arm64/kernel/Makefile | 4 ---- arch/arm64/kernel/vdso/Makefile | 7 +++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 67074e612db2..5820cff5f843 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -128,6 +128,16 @@ archclean: $(Q)$(MAKE) $(clean)=$(boot) $(Q)$(MAKE) $(clean)=$(boot)/dts +# We need to generate vdso-offsets.h before compiling certain files in kernel/. +# In order to do that, we should use the archprepare target, but we can't since +# asm-offsets.h is included in some files used to generate vdso-offsets.h, and +# asm-offsets.h is built in prepare0, for which archprepare is a dependency. +# Therefore we need to generate the header after prepare0 has been made, hence +# this hack. 
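# Editor's aside, illustrative only and not part of this patch: the two
# rules below are the generic make idiom for "run B after A but before
# C", since prerequisites always complete before their target's recipe:
#
#   C: B              # C is a hook make already runs (here: prepare)
#   B: A              # depending on A guarantees A finishes first
#   	<recipe for B>
#
# Mapped onto this hack: A = prepare0 (produces asm-offsets.h), B =
# vdso_prepare (produces vdso-offsets.h), C = prepare.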
+prepare: vdso_prepare +vdso_prepare: prepare0 + $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso include/generated/vdso-offsets.h + define archhelp echo '* Image.gz - Compressed kernel image (arch/$(ARCH)/boot/Image.gz)' echo ' Image - Uncompressed kernel image (arch/$(ARCH)/boot/Image)' diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 49a2430b0786..b570e9e36812 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -49,7 +49,3 @@ obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) head-y := head.o extra-y += $(head-y) vmlinux.lds - -# vDSO - this must be built first to generate the symbol offsets -$(call objectify,$(arm64-obj-y)): $(obj)/vdso/vdso-offsets.h -$(obj)/vdso/vdso-offsets.h: $(obj)/vdso diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index b467fd0a384b..62c84f7cb01b 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -23,7 +23,7 @@ GCOV_PROFILE := n ccflags-y += -Wl,-shared obj-y += vdso.o -extra-y += vdso.lds vdso-offsets.h +extra-y += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Force dependency (incbin is bad) @@ -42,11 +42,10 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ define cmd_vdsosym - $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ && \ - cp $@ include/generated/ + $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ endef -$(obj)/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE +include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) # Assembly rules for the .S files From 9d2622be9a417c2de47bfe38aa617d1db51de518 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 12 Jul 2016 11:23:59 +0100 Subject: [PATCH 0728/3220] UPSTREAM: arm64: Refactor vDSO time functions (cherry pick from commit b33f491f5a9aaf171b7de0f905362eb0314af478) Time functions are directly implemented in assembly in arm64, and it is desirable to keep it this way for performance reasons (everything fits in registers, so that the stack is not used at all). However, the current implementation is quite difficult to read and understand (even considering it's assembly). Additionally, due to the structure of __kernel_clock_gettime, which heavily uses conditional branches to share code between the different clocks, it is difficult to support a new clock without making the branches even harder to follow. This commit completely refactors the structure of clock_gettime (and gettimeofday along the way) while keeping exactly the same algorithms. We no longer try to share code; instead, macros provide common operations. This new approach comes with a number of advantages: - In clock_gettime, clock implementations are no longer interspersed, making them much more readable. Additionally, macros only use registers passed as arguments or reserved with .req, this way it is easy to make sure that registers are properly allocated. To avoid a large number of branches in a given execution path, a jump table is used; a normal execution uses 3 unconditional branches. - __do_get_tspec has been replaced with 2 macros (get_ts_clock_mono, get_clock_shifted_nsec) and explicit loading of data from the vDSO page. Consequently, clock_gettime and gettimeofday are now leaf functions, and saving x30 (lr) is no longer necessary. - Variables protected by tb_seq_count are now loaded all at once, allowing to merge the seqcnt_read macro into seqcnt_check. 
- For CLOCK_REALTIME_COARSE, removed an unused load of the wall to monotonic timespec. - For CLOCK_MONOTONIC_COARSE, removed a few shift instructions. Obviously, the downside of sharing less code is an increase in code size. However since the vDSO has its own code page, this does not really matter, as long as the size of the DSO remains below 4 kB. For now this should be all right: Before After vdso.so size (B) 2776 3000 Signed-off-by: Kevin Brodsky Reviewed-by: Dave Martin Acked-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 20045882 Bug: 19198045 Change-Id: I0a0036c54419ee561efe3aad97489e7748ddc59e --- arch/arm64/kernel/vdso/gettimeofday.S | 298 +++++++++++++++----------- 1 file changed, 172 insertions(+), 126 deletions(-) diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index efa79e8d4196..c06919ee29e1 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -26,24 +26,100 @@ #define NSEC_PER_SEC_HI16 0x3b9a vdso_data .req x6 -use_syscall .req w7 -seqcnt .req w8 +seqcnt .req w7 +w_tmp .req w8 +x_tmp .req x8 + +/* + * Conventions for macro arguments: + * - An argument is write-only if its name starts with "res". + * - All other arguments are read-only, unless otherwise specified. + */ .macro seqcnt_acquire 9999: ldr seqcnt, [vdso_data, #VDSO_TB_SEQ_COUNT] tbnz seqcnt, #0, 9999b dmb ishld - ldr use_syscall, [vdso_data, #VDSO_USE_SYSCALL] .endm - .macro seqcnt_read, cnt + .macro seqcnt_check fail dmb ishld - ldr \cnt, [vdso_data, #VDSO_TB_SEQ_COUNT] + ldr w_tmp, [vdso_data, #VDSO_TB_SEQ_COUNT] + cmp w_tmp, seqcnt + b.ne \fail .endm - .macro seqcnt_check, cnt, fail - cmp \cnt, seqcnt - b.ne \fail + .macro syscall_check fail + ldr w_tmp, [vdso_data, #VDSO_USE_SYSCALL] + cbnz w_tmp, \fail + .endm + + .macro get_nsec_per_sec res + mov \res, #NSEC_PER_SEC_LO16 + movk \res, #NSEC_PER_SEC_HI16, lsl #16 + .endm + + /* + * Returns the clock delta, in nanoseconds left-shifted by the clock + * shift. + */ + .macro get_clock_shifted_nsec res, cycle_last, mult + /* Read the virtual counter. */ + isb + mrs x_tmp, cntvct_el0 + /* Calculate cycle delta and convert to ns. */ + sub \res, x_tmp, \cycle_last + /* We can only guarantee 56 bits of precision. */ + movn x_tmp, #0xff00, lsl #48 + and \res, x_tmp, \res + mul \res, \res, \mult + .endm + + /* + * Returns in res_{sec,nsec} the REALTIME timespec, based on the + * "wall time" (xtime) and the clock_mono delta. + */ + .macro get_ts_realtime res_sec, res_nsec, \ + clock_nsec, xtime_sec, xtime_nsec, nsec_to_sec + add \res_nsec, \clock_nsec, \xtime_nsec + udiv x_tmp, \res_nsec, \nsec_to_sec + add \res_sec, \xtime_sec, x_tmp + msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec + .endm + + /* sec and nsec are modified in place. */ + .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec + /* Add timespec. */ + add \sec, \sec, \ts_sec + add \nsec, \nsec, \ts_nsec + + /* Normalise the new timespec. */ + cmp \nsec, \nsec_to_sec + b.lt 9999f + sub \nsec, \nsec, \nsec_to_sec + add \sec, \sec, #1 +9999: + cmp \nsec, #0 + b.ge 9998f + add \nsec, \nsec, \nsec_to_sec + sub \sec, \sec, #1 +9998: + .endm + + .macro clock_gettime_return, shift=0 + .if \shift == 1 + lsr x11, x11, x12 + .endif + stp x10, x11, [x1, #TSPEC_TV_SEC] + mov x0, xzr + ret + .endm + + .macro jump_slot jumptable, index, label + .if (. 
- \jumptable) != 4 * (\index) + .error "Jump slot index mismatch" + .endif + b \label .endm .text @@ -51,18 +127,24 @@ seqcnt .req w8 /* int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); */ ENTRY(__kernel_gettimeofday) .cfi_startproc - mov x2, x30 - .cfi_register x30, x2 - - /* Acquire the sequence counter and get the timespec. */ adr vdso_data, _vdso_data -1: seqcnt_acquire - cbnz use_syscall, 4f - /* If tv is NULL, skip to the timezone code. */ cbz x0, 2f - bl __do_get_tspec - seqcnt_check w9, 1b + + /* Compute the time of day. */ +1: seqcnt_acquire + syscall_check fail=4f + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + seqcnt_check fail=1b + + get_nsec_per_sec res=x9 + lsl x9, x9, x12 + + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 /* Convert ns to us. */ mov x13, #1000 @@ -76,95 +158,102 @@ ENTRY(__kernel_gettimeofday) stp w4, w5, [x1, #TZ_MINWEST] 3: mov x0, xzr - ret x2 + ret 4: /* Syscall fallback. */ mov x8, #__NR_gettimeofday svc #0 - ret x2 + ret .cfi_endproc ENDPROC(__kernel_gettimeofday) +#define JUMPSLOT_MAX CLOCK_MONOTONIC_COARSE + /* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */ ENTRY(__kernel_clock_gettime) .cfi_startproc - cmp w0, #CLOCK_REALTIME - ccmp w0, #CLOCK_MONOTONIC, #0x4, ne - b.ne 2f - - mov x2, x30 - .cfi_register x30, x2 - - /* Get kernel timespec. */ + cmp w0, #JUMPSLOT_MAX + b.hi syscall adr vdso_data, _vdso_data -1: seqcnt_acquire - cbnz use_syscall, 7f + adr x_tmp, jumptable + add x_tmp, x_tmp, w0, uxtw #2 + br x_tmp - bl __do_get_tspec - seqcnt_check w9, 1b + ALIGN +jumptable: + jump_slot jumptable, CLOCK_REALTIME, realtime + jump_slot jumptable, CLOCK_MONOTONIC, monotonic + b syscall + b syscall + b syscall + jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse + jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse - mov x30, x2 + .if (. - jumptable) != 4 * (JUMPSLOT_MAX + 1) + .error "Wrong jumptable size" + .endif - cmp w0, #CLOCK_MONOTONIC - b.ne 6f + ALIGN +realtime: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + seqcnt_check fail=realtime - /* Get wtm timespec. */ - ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] + /* All computations are done with left-shifted nsecs. */ + get_nsec_per_sec res=x9 + lsl x9, x9, x12 - /* Check the sequence counter. */ - seqcnt_read w9 - seqcnt_check w9, 1b - b 4f -2: - cmp w0, #CLOCK_REALTIME_COARSE - ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne - b.ne 8f + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 + clock_gettime_return, shift=1 - /* xtime_coarse_nsec is already right-shifted */ - mov x12, #0 + ALIGN +monotonic: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] + seqcnt_check fail=monotonic - /* Get coarse timespec. */ - adr vdso_data, _vdso_data -3: seqcnt_acquire + /* All computations are done with left-shifted nsecs. 
*/ + lsl x4, x4, x12 + get_nsec_per_sec res=x9 + lsl x9, x9, x12 + + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 + + add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 + clock_gettime_return, shift=1 + + ALIGN +realtime_coarse: + seqcnt_acquire ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] + seqcnt_check fail=realtime_coarse + clock_gettime_return - /* Get wtm timespec. */ + ALIGN +monotonic_coarse: + seqcnt_acquire + ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] + seqcnt_check fail=monotonic_coarse - /* Check the sequence counter. */ - seqcnt_read w9 - seqcnt_check w9, 3b + /* Computations are done in (non-shifted) nsecs. */ + get_nsec_per_sec res=x9 + add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 + clock_gettime_return - cmp w0, #CLOCK_MONOTONIC_COARSE - b.ne 6f -4: - /* Add on wtm timespec. */ - add x10, x10, x13 - lsl x14, x14, x12 - add x11, x11, x14 - - /* Normalise the new timespec. */ - mov x15, #NSEC_PER_SEC_LO16 - movk x15, #NSEC_PER_SEC_HI16, lsl #16 - lsl x15, x15, x12 - cmp x11, x15 - b.lt 5f - sub x11, x11, x15 - add x10, x10, #1 -5: - cmp x11, #0 - b.ge 6f - add x11, x11, x15 - sub x10, x10, #1 - -6: /* Store to the user timespec. */ - lsr x11, x11, x12 - stp x10, x11, [x1, #TSPEC_TV_SEC] - mov x0, xzr - ret -7: - mov x30, x2 -8: /* Syscall fallback. */ + ALIGN +syscall: /* Syscall fallback. */ mov x8, #__NR_clock_gettime svc #0 ret @@ -203,46 +292,3 @@ ENTRY(__kernel_clock_getres) .quad CLOCK_COARSE_RES .cfi_endproc ENDPROC(__kernel_clock_getres) - -/* - * Read the current time from the architected counter. - * Expects vdso_data to be initialised. - * Clobbers the temporary registers (x9 - x15). - * Returns: - * - w9 = vDSO sequence counter - * - (x10, x11) = (ts->tv_sec, shifted ts->tv_nsec) - * - w12 = cs_shift - */ -ENTRY(__do_get_tspec) - .cfi_startproc - - /* Read from the vDSO data page. */ - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] - seqcnt_read w9 - - /* Read the virtual counter. */ - isb - mrs x15, cntvct_el0 - - /* Calculate cycle delta and convert to ns. */ - sub x10, x15, x10 - /* We can only guarantee 56 bits of precision. */ - movn x15, #0xff00, lsl #48 - and x10, x15, x10 - mul x10, x10, x11 - - /* Use the kernel time to calculate the new timespec. */ - mov x11, #NSEC_PER_SEC_LO16 - movk x11, #NSEC_PER_SEC_HI16, lsl #16 - lsl x11, x11, x12 - add x15, x10, x14 - udiv x14, x15, x11 - add x10, x13, x14 - mul x13, x14, x11 - sub x11, x15, x13 - - ret - .cfi_endproc -ENDPROC(__do_get_tspec) From c2e0fb3550184ee2afeb98c7e8fd481ab750679a Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 12 Jul 2016 11:24:00 +0100 Subject: [PATCH 0729/3220] UPSTREAM: arm64: Add support for CLOCK_MONOTONIC_RAW in clock_gettime() vDSO (cherry pick from commit 49eea433b326a0ac5c7c941a011b2c65990bd19b) So far the arm64 clock_gettime() vDSO implementation only supported the following clocks, falling back to the syscall for the others: - CLOCK_REALTIME{,_COARSE} - CLOCK_MONOTONIC{,_COARSE} This patch adds support for the CLOCK_MONOTONIC_RAW clock, taking advantage of the recent refactoring of the vDSO time functions. 
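In C terms, the computation added for CLOCK_MONOTONIC_RAW is roughly the following (an illustrative sketch only, not the implementation: the tb_seq_count retry loop and the syscall fallback are elided, field names mirror struct vdso_data with vd pointing at the vDSO data page and ts at the output timespec, and cntvct() stands in for the isb + mrs cntvct_el0 pair):

	u64 shifted_sec = (u64)NSEC_PER_SEC << vd->cs_shift;
	/* 56-bit cycle delta, converted to left-shifted nanoseconds */
	u64 cycles = (cntvct() - vd->cs_cycle_last) & ((1ULL << 56) - 1);
	u64 nsec = cycles * vd->cs_raw_mult;
	u64 sec = nsec / shifted_sec;

	nsec -= sec * shifted_sec;
	sec += vd->raw_time_sec;
	nsec += (u64)vd->raw_time_nsec << vd->cs_shift;
	if (nsec >= shifted_sec) {		/* normalise */
		nsec -= shifted_sec;
		sec++;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec >> vd->cs_shift;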
Like the non-_COARSE clocks, this only works when the "arch_sys_counter" clocksource is in use (allowing us to read the current time from the virtual counter register), otherwise we also have to fall back to the syscall. Most of the data is shared with CLOCK_MONOTONIC, and the algorithm is similar. The reference implementation in kernel/time/timekeeping.c shows that: - CLOCK_MONOTONIC = tk->wall_to_monotonic + tk->xtime_sec + timekeeping_get_ns(&tk->tkr_mono) - CLOCK_MONOTONIC_RAW = tk->raw_time + timekeeping_get_ns(&tk->tkr_raw) - tkr_mono and tkr_raw are identical (in particular, same clocksource), except these members: * mult (only mono's multiplier is NTP-adjusted) * xtime_nsec (always 0 for raw) Therefore, tk->raw_time and tkr_raw->mult are now also stored in the vDSO data page. Cc: Ali Saidi Signed-off-by: Kevin Brodsky Reviewed-by: Dave Martin Acked-by: Will Deacon Signed-off-by: Catalin Marinas Bug: 20045882 Bug: 19198045 Change-Id: I854adcc1192757a1ac40662e85d466ca709d0682 --- arch/arm64/include/asm/vdso_datapage.h | 8 +++- arch/arm64/kernel/asm-offsets.c | 6 ++- arch/arm64/kernel/vdso.c | 8 +++- arch/arm64/kernel/vdso/gettimeofday.S | 57 +++++++++++++++++++++----- 4 files changed, 64 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/vdso_datapage.h b/arch/arm64/include/asm/vdso_datapage.h index de66199673d7..2b9a63771eda 100644 --- a/arch/arm64/include/asm/vdso_datapage.h +++ b/arch/arm64/include/asm/vdso_datapage.h @@ -22,6 +22,8 @@ struct vdso_data { __u64 cs_cycle_last; /* Timebase at clocksource init */ + __u64 raw_time_sec; /* Raw time */ + __u64 raw_time_nsec; __u64 xtime_clock_sec; /* Kernel time */ __u64 xtime_clock_nsec; __u64 xtime_coarse_sec; /* Coarse time */ @@ -29,8 +31,10 @@ struct vdso_data { __u64 wtm_clock_sec; /* Wall to monotonic time */ __u64 wtm_clock_nsec; __u32 tb_seq_count; /* Timebase sequence counter */ - __u32 cs_mult; /* Clocksource multiplier */ - __u32 cs_shift; /* Clocksource shift */ + /* cs_* members must be adjacent and in this order (ldp accesses) */ + __u32 cs_mono_mult; /* NTP-adjusted clocksource multiplier */ + __u32 cs_shift; /* Clocksource shift (mono = raw) */ + __u32 cs_raw_mult; /* Raw clocksource multiplier */ __u32 tz_minuteswest; /* Whacky timezone stuff */ __u32 tz_dsttime; __u32 use_syscall; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 40ef661bd3a5..b84d8e85d19d 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -79,6 +79,7 @@ int main(void) BLANK(); DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); + DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW); DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE); @@ -86,6 +87,8 @@ int main(void) DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); BLANK(); DEFINE(VDSO_CS_CYCLE_LAST, offsetof(struct vdso_data, cs_cycle_last)); + DEFINE(VDSO_RAW_TIME_SEC, offsetof(struct vdso_data, raw_time_sec)); + DEFINE(VDSO_RAW_TIME_NSEC, offsetof(struct vdso_data, raw_time_nsec)); DEFINE(VDSO_XTIME_CLK_SEC, offsetof(struct vdso_data, xtime_clock_sec)); DEFINE(VDSO_XTIME_CLK_NSEC, offsetof(struct vdso_data, xtime_clock_nsec)); DEFINE(VDSO_XTIME_CRS_SEC, offsetof(struct vdso_data, xtime_coarse_sec)); @@ -93,7 +96,8 @@ int main(void) DEFINE(VDSO_WTM_CLK_SEC, offsetof(struct vdso_data, wtm_clock_sec)); DEFINE(VDSO_WTM_CLK_NSEC, offsetof(struct vdso_data, wtm_clock_nsec)); DEFINE(VDSO_TB_SEQ_COUNT, 
offsetof(struct vdso_data, tb_seq_count)); - DEFINE(VDSO_CS_MULT, offsetof(struct vdso_data, cs_mult)); + DEFINE(VDSO_CS_MONO_MULT, offsetof(struct vdso_data, cs_mono_mult)); + DEFINE(VDSO_CS_RAW_MULT, offsetof(struct vdso_data, cs_raw_mult)); DEFINE(VDSO_CS_SHIFT, offsetof(struct vdso_data, cs_shift)); DEFINE(VDSO_TZ_MINWEST, offsetof(struct vdso_data, tz_minuteswest)); DEFINE(VDSO_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime)); diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 97bc68f4c689..54f7b327fd18 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -212,10 +212,16 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; if (!use_syscall) { + /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; + vdso_data->raw_time_sec = tk->raw_time.tv_sec; + vdso_data->raw_time_nsec = tk->raw_time.tv_nsec; vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->cs_mult = tk->tkr_mono.mult; + /* tkr_raw.xtime_nsec == 0 */ + vdso_data->cs_mono_mult = tk->tkr_mono.mult; + vdso_data->cs_raw_mult = tk->tkr_raw.mult; + /* tkr_mono.shift == tkr_raw.shift */ vdso_data->cs_shift = tk->tkr_mono.shift; } diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index c06919ee29e1..e00b4671bd7c 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -87,6 +87,15 @@ x_tmp .req x8 msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec .endm + /* + * Returns in res_{sec,nsec} the timespec based on the clock_raw delta, + * used for CLOCK_MONOTONIC_RAW. + */ + .macro get_ts_clock_raw res_sec, res_nsec, clock_nsec, nsec_to_sec + udiv \res_sec, \clock_nsec, \nsec_to_sec + msub \res_nsec, \res_sec, \nsec_to_sec, \clock_nsec + .endm + /* sec and nsec are modified in place. */ .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec /* Add timespec. 
*/ @@ -135,7 +144,8 @@ ENTRY(__kernel_gettimeofday) 1: seqcnt_acquire syscall_check fail=4f ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] seqcnt_check fail=1b @@ -172,20 +182,20 @@ ENDPROC(__kernel_gettimeofday) /* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */ ENTRY(__kernel_clock_gettime) .cfi_startproc - cmp w0, #JUMPSLOT_MAX - b.hi syscall + cmp w0, #JUMPSLOT_MAX + b.hi syscall adr vdso_data, _vdso_data - adr x_tmp, jumptable - add x_tmp, x_tmp, w0, uxtw #2 - br x_tmp + adr x_tmp, jumptable + add x_tmp, x_tmp, w0, uxtw #2 + br x_tmp ALIGN jumptable: jump_slot jumptable, CLOCK_REALTIME, realtime jump_slot jumptable, CLOCK_MONOTONIC, monotonic - b syscall - b syscall - b syscall + b syscall + b syscall + jump_slot jumptable, CLOCK_MONOTONIC_RAW, monotonic_raw jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse @@ -198,7 +208,8 @@ realtime: seqcnt_acquire syscall_check fail=syscall ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] seqcnt_check fail=realtime @@ -216,7 +227,8 @@ monotonic: seqcnt_acquire syscall_check fail=syscall ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] seqcnt_check fail=monotonic @@ -233,6 +245,28 @@ monotonic: add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 clock_gettime_return, shift=1 + ALIGN +monotonic_raw: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + /* w11 = cs_raw_mult, w12 = cs_shift */ + ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT] + ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC] + seqcnt_check fail=monotonic_raw + + /* All computations are done with left-shifted nsecs. */ + lsl x14, x14, x12 + get_nsec_per_sec res=x9 + lsl x9, x9, x12 + + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_clock_raw res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, nsec_to_sec=x9 + + add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 + clock_gettime_return, shift=1 + ALIGN realtime_coarse: seqcnt_acquire @@ -265,6 +299,7 @@ ENTRY(__kernel_clock_getres) .cfi_startproc cmp w0, #CLOCK_REALTIME ccmp w0, #CLOCK_MONOTONIC, #0x4, ne + ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne b.ne 1f ldr x2, 5f From b4b7a2466892929e5f72104089505f7b5bb7bfd6 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 15 Aug 2016 09:20:21 +0100 Subject: [PATCH 0730/3220] UPSTREAM: ARM: 8597/1: VDSO: put RO and RO after init objects into proper sections (cherry pick from commit 92bb8d5d55f7fe1a7a1201c42120c1611840807c) vdso_data_mapping is never modified, so mark it as const. vdso_total_pages, vdso_data_page, vdso_text_mapping and cntvct_ok are initialized by vdso_init(), thereafter are read only. The fact that they are read only after init makes them candidates for __ro_after_init declarations. 
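As a minimal sketch of the pattern (illustrative, not part of the patch): anything placed in the .data..ro_after_init section stays writable during boot and is write-protected once init completes on kernels that enforce read-only data, so a variable assigned exactly once by vdso_init() can trade __read_mostly for __ro_after_init:

	#include <linux/cache.h>	/* __ro_after_init */

	static bool cntvct_ok __ro_after_init;

	static int __init vdso_init(void)
	{
		cntvct_ok = cntvct_functional();	/* written once, at boot */
		return 0;
	}

	/* any store to cntvct_ok after init would now fault */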
Signed-off-by: Jisheng Zhang Reviewed-by: Kees Cook Acked-by: Nathan Lynch Signed-off-by: Russell King Bug: 20045882 Bug: 19198045 Change-Id: I0b2f9807d3cd66c5ef7457ac6fc180a41cbe05c6 --- arch/arm/kernel/vdso.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 994e971a8538..bbbffe946122 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -17,6 +17,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/cache.h> #include #include #include @@ -39,7 +40,7 @@ static struct page **vdso_text_pagelist; /* Total number of pages needed for the data and text portions of the VDSO. */ -unsigned int vdso_total_pages __read_mostly; +unsigned int vdso_total_pages __ro_after_init; /* * The VDSO data page. */ @@ -47,13 +48,13 @@ unsigned int vdso_total_pages __read_mostly; static union vdso_data_store vdso_data_store __page_aligned_data; static struct vdso_data *vdso_data = &vdso_data_store.data; -static struct page *vdso_data_page; -static struct vm_special_mapping vdso_data_mapping = { +static struct page *vdso_data_page __ro_after_init; +static const struct vm_special_mapping vdso_data_mapping = { .name = "[vvar]", .pages = &vdso_data_page, }; -static struct vm_special_mapping vdso_text_mapping = { +static struct vm_special_mapping vdso_text_mapping __ro_after_init = { .name = "[vdso]", }; @@ -67,7 +68,7 @@ struct elfinfo { /* Cached result of boot-time check for whether the arch timer exists, * and if so, whether the virtual counter is useable. */ -static bool cntvct_ok __read_mostly; +static bool cntvct_ok __ro_after_init; static bool __init cntvct_functional(void) { From 30e1bf349c52dd1dbb599562559b60dc7c702f3a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 15 Aug 2016 14:45:44 +0800 Subject: [PATCH 0731/3220] UPSTREAM: arm64: vdso: add __init section marker to alloc_vectors_page (cherry pick from commit 1aed28f94ce6c1f6c24bcbbd5fcd749b55f65e9e) It is not needed after booting, so this patch moves the alloc_vectors_page function to the __init section. Acked-by: Mark Rutland Signed-off-by: Jisheng Zhang Signed-off-by: Will Deacon Bug: 20045882 Bug: 19198045 Change-Id: I1d7b89f9e20890b18e77c8752f1eef33d721ea81 --- arch/arm64/kernel/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 54f7b327fd18..0949b4812887 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -55,7 +55,7 @@ struct vdso_data *vdso_data = &vdso_data_store.data; */ static struct page *vectors_page[1]; -static int alloc_vectors_page(void) +static int __init alloc_vectors_page(void) { extern char __kuser_helper_start[], __kuser_helper_end[]; extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; From 0c61bc75ee662c79fc22f94979ca2cb6b5de0e54 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 15 Aug 2016 14:45:45 +0800 Subject: [PATCH 0732/3220] UPSTREAM: arm64: vdso: constify vm_special_mapping used for aarch32 vectors page (cherry pick from commit b6d081bddf397026575a437b603b118dff2606ff) The vm_special_mapping spec which is used for the aarch32 vectors page is never modified, so mark it as const.
Acked-by: Mark Rutland Signed-off-by: Jisheng Zhang Signed-off-by: Will Deacon Bug: 20045882 Bug: 19198045 Change-Id: If1afd11bfeefee45a2b96634b18143937a773a14 --- arch/arm64/kernel/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 0949b4812887..3b8acfae7797 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -88,7 +88,7 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr = AARCH32_VECTORS_BASE; - static struct vm_special_mapping spec = { + static const struct vm_special_mapping spec = { .name = "[vectors]", .pages = vectors_page, From a08cafa7e09cb5450a2596e6ea15bdea976d8d59 Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 28 Mar 2017 13:30:18 -0700 Subject: [PATCH 0733/3220] ANDROID: ARM64: Allow to choose appended kernel image By default appended kernel image is Image.gz-dtb. New config option BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME allows to choose between Image.gz-dtb and Image-dtb. Change-Id: I1c71b85136f1beeb61782e4646820718c1ccd7e4 Signed-off-by: Dmitry Shmidt --- arch/arm64/Kconfig | 20 ++++++++++++++++++++ arch/arm64/Makefile | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 77bea9751a31..35be8566140e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -930,6 +930,26 @@ config BUILD_ARM64_APPENDED_DTB_IMAGE DTBs to be built by default (instead of a standalone Image.gz.) The image will built in arch/arm64/boot/Image.gz-dtb +choice + prompt "Appended DTB Kernel Image name" + depends on BUILD_ARM64_APPENDED_DTB_IMAGE + help + Enabling this option will cause a specific kernel image Image or + Image.gz to be used for final image creation. + The image will built in arch/arm64/boot/IMAGE-NAME-dtb + + config IMG_GZ_DTB + bool "Image.gz-dtb" + config IMG_DTB + bool "Image-dtb" +endchoice + +config BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME + string + depends on BUILD_ARM64_APPENDED_DTB_IMAGE + default "Image.gz-dtb" if IMG_GZ_DTB + default "Image-dtb" if IMG_DTB + config BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES string "Default dtb names" depends on BUILD_ARM64_APPENDED_DTB_IMAGE diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 5820cff5f843..ba55698f71e8 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -85,7 +85,7 @@ core-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a # Default target when executing plain make ifeq ($(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE),y) -KBUILD_IMAGE := Image.gz-dtb +KBUILD_IMAGE := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME)) else KBUILD_IMAGE := Image.gz endif From 55c20761453ed9b2d22f5c5101c3833dc1a7159a Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:38 -0700 Subject: [PATCH 0734/3220] config: android: move device mapper options to recommended CONFIG_MD is in recommended, but other dependent options like DM_CRYPT and DM_VERITY options are in base. The result is the options in base don't get enabled when applying both base and recommended fragments. Move all the options to recommended. 
Link: http://lkml.kernel.org/r/20160908185934.18098-1-robh@kernel.org Signed-off-by: Rob Herring Acked-by: John Stultz Cc: Amit Pundir Cc: Dmitry Shmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f023a3956f273859ed36f624f75a66c272124b16) Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 4 ---- android/configs/android-recommended.cfg | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index d1f9628e4377..59d4908b3343 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -13,7 +13,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y CONFIG_AUDIT=y -CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y CONFIG_CGROUP_CPUACCT=y @@ -21,9 +20,6 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_DM_CRYPT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index 28610303db60..e346b378dd64 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -10,6 +10,7 @@ CONFIG_ANDROID_TIMED_GPIO=y CONFIG_ARM_KERNMEM_PERMS=y CONFIG_ARM64_SW_TTBR0_PAN=y CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 @@ -17,7 +18,10 @@ CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DEBUG_RODATA=y +CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y CONFIG_DRAGONRISE_FF=y CONFIG_ENABLE_DEFAULT_TRACERS=y CONFIG_EXT4_FS=y From 312b1f89aadbca0cc46f4cabad16f6b00f27c402 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:41 -0700 Subject: [PATCH 0735/3220] UPSTREAM: config: android: set SELinux as default security mode Android won't boot without SELinux enabled, so make it the default. 
Link: http://lkml.kernel.org/r/20160908185934.18098-2-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d90ae51a3e7556c9f50431db43cd8190934ccf94) Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 59d4908b3343..3672bf339006 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -20,6 +20,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y From b6cf5c77e06e10cb93e768f96893007cf54a77ae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 11 Oct 2016 13:54:36 -0700 Subject: [PATCH 0736/3220] UPSTREAM: config/android: Remove CONFIG_IPV6_PRIVACY Option is long gone, see commit 5d9efa7ee99e ("ipv6: Remove privacy config option.") Link: http://lkml.kernel.org/r/20160811170340.9859-1-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit a2c6a235dbf4318fc7f7981932478e6c47f093ab) Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 3672bf339006..1ff32af855c3 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -41,7 +41,6 @@ CONFIG_IPV6=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IP_ADVANCED_ROUTER=y From 635194062b5eb5248e7bf98ddbacf63c2ec67119 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Apr 2017 19:08:12 +0200 Subject: [PATCH 0737/3220] ANDROID: sort android-recommended.cfg It got out-of-order, so resort it to make it easier to sync with other trees. Signed-off-by: Greg Kroah-Hartman --- android/configs/android-recommended.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index e346b378dd64..eecf8d80453a 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -7,8 +7,8 @@ # CONFIG_PM_WAKELOCKS_GC is not set # CONFIG_VT is not set CONFIG_ANDROID_TIMED_GPIO=y -CONFIG_ARM_KERNMEM_PERMS=y CONFIG_ARM64_SW_TTBR0_PAN=y +CONFIG_ARM_KERNMEM_PERMS=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y @@ -97,6 +97,7 @@ CONFIG_LOGIRUMBLEPAD2_FF=y CONFIG_LOGITECH_FF=y CONFIG_MD=y CONFIG_MEDIA_SUPPORT=y +CONFIG_MEMORY_STATE_TIME=y CONFIG_MSDOS_FS=y CONFIG_PANIC_TIMEOUT=5 CONFIG_PANTHERLORD_FF=y @@ -126,7 +127,6 @@ CONFIG_TIMER_STATS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_UHID=y -CONFIG_MEMORY_STATE_TIME=y CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_HIDDEV=y From 025b221d5309fe93280e765cf16ad4c8d2ce76c4 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 31 Mar 2017 09:33:33 -0700 Subject: [PATCH 0738/3220] ANDROID: binder: add hwbinder,vndbinder to BINDER_DEVICES. These will be required going forward. 
Change-Id: Idf0593461cef88051564ae0df495c156e31048c4 Signed-off-by: Martijn Coenen --- android/configs/android-base.cfg | 1 + drivers/android/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 1ff32af855c3..ece06816e046 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -9,6 +9,7 @@ # CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index a82fc022d34b..4d4cdc1a6e25 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -22,7 +22,7 @@ config ANDROID_BINDER_IPC config ANDROID_BINDER_DEVICES string "Android Binder devices" depends on ANDROID_BINDER_IPC - default "binder" + default "binder,hwbinder,vndbinder" ---help--- Default value for the binder.devices parameter. From 21afcf6ff8c745ab9c551f113eeaacff3f41b515 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Apr 2017 19:50:20 +0200 Subject: [PATCH 0739/3220] ANDROID: android-base.cfg: properly sort the file It somehow got out of alphabetical order, fix it to make merges and testing easier. Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index ece06816e046..a1969d0d7926 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -8,8 +8,8 @@ # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set CONFIG_ANDROID=y -CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder +CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y @@ -140,9 +140,9 @@ CONFIG_PREEMPT=y CONFIG_PROFILING=y CONFIG_QFMT_V2=y CONFIG_QUOTA=y +CONFIG_QUOTACTL=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_QUOTA_TREE=y -CONFIG_QUOTACTL=y CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y @@ -158,14 +158,14 @@ CONFIG_SYNC=y CONFIG_TUN=y CONFIG_UID_SYS_STATS=y CONFIG_UNIX=y -CONFIG_USB_GADGET=y CONFIG_USB_CONFIGFS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_MTP=y -CONFIG_USB_CONFIGFS_F_PTP=y CONFIG_USB_CONFIGFS_F_ACC=y CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y -CONFIG_USB_CONFIGFS_UEVENT=y +CONFIG_USB_CONFIGFS_F_FS=y CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_MTP=y +CONFIG_USB_CONFIGFS_F_PTP=y +CONFIG_USB_CONFIGFS_UEVENT=y +CONFIG_USB_GADGET=y CONFIG_USB_OTG_WAKELOCK=y CONFIG_XFRM_USER=y From 8588d88f1a37342dc3e9c0000cdc45792e237f3e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Apr 2017 19:45:38 +0200 Subject: [PATCH 0740/3220] ANDROID: android-base.cfg: add CONFIG_IKCONFIG option This adds CONFIG_IKCONFIG and CONFIG_IKCONFIG_PROC options, which are a requirement for the O release. 
Bug: 35803310 Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 7d9280f579ff0731facb1e10f32e4a88a07f33f8) --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index a1969d0d7926..7d4ce82f5b8e 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -26,6 +26,8 @@ CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y From 10610ac9af3a9645ad1c7ea876f8859dbb650251 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Apr 2017 19:46:59 +0200 Subject: [PATCH 0741/3220] ANDROID: android-base.cfg: add CONFIG_MODULES option This adds CONFIG_MODULES, CONFIG_MODULE_UNLOAD, and CONFIG_MODVERSIONS which are required by the O release. Bug: 35803310 Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 56f22e654a311f3c2492b8b3609916265fe34e20) --- android/configs/android-base.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 7d4ce82f5b8e..59c4f3a0df19 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -65,6 +65,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_TARGET_NETMAP=y CONFIG_IP_NF_TARGET_REDIRECT=y CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y From 31f9ce371384a31cac09e3b659d119716e1632e6 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 16 Jan 2017 17:35:52 +0900 Subject: [PATCH 0742/3220] android: base-cfg: enable CONFIG_INET_DIAG_DESTROY As of Android N, this is required to close sockets when a network disconnects. Change-Id: I9fe81c5fc5224c17bfd8d9e236ea9e436b5971cb (cherry picked from commit 4a15cee4bdaf764756e98cd8f03784f330459ab1) Signed-off-by: Greg Kroah-Hartman --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 59c4f3a0df19..30e01074f64a 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -29,6 +29,7 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y +CONFIG_INET6_DIAG_DESTROY=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y CONFIG_INET=y From e246a2f11fcca7702dbf526da235d37c828742ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 27 Sep 2016 23:57:58 -0700 Subject: [PATCH 0743/3220] UPSTREAM: ipv6 addrconf: implement RFC7559 router solicitation backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements: https://tools.ietf.org/html/rfc7559 Backoff is performed according to RFC3315 section 14: https://tools.ietf.org/html/rfc3315#section-14 We allow setting /proc/sys/net/ipv6/conf/*/router_solicitations to a negative value meaning an unlimited number of retransmits, and we make this the new default (inline with the RFC). We also add a new setting: /proc/sys/net/ipv6/conf/*/router_solicitation_max_interval defaulting to 1 hour (per RFC recommendation). Signed-off-by: Maciej Żenczykowski Acked-by: Erik Kline Signed-off-by: David S. 
Miller (cherry picked from commit bd11f0741fa5a2c296629898ad07759dd12b35bb in DaveM's net-next/master, should make Linus' tree in 4.9-rc1) Change-Id: Ia32cdc5c61481893ef8040734e014bf2229fc39e --- include/linux/ipv6.h | 1 + include/net/addrconf.h | 3 ++- include/net/if_inet6.h | 1 + net/ipv6/addrconf.c | 51 ++++++++++++++++++++++++++++++++++++------ 4 files changed, 48 insertions(+), 8 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1182f0e21697..a0fc3cf932af 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -18,6 +18,7 @@ struct ipv6_devconf { __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; + __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 3275ddf9f00d..d540657819ef 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -1,8 +1,9 @@ #ifndef _ADDRCONF_H #define _ADDRCONF_H -#define MAX_RTR_SOLICITATIONS 3 +#define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) +#define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 1c8b6820b694..515352c6280a 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -201,6 +201,7 @@ struct inet6_dev { struct ipv6_devstat stats; struct timer_list rs_timer; + __s32 rs_interval; /* in jiffies */ __u8 rs_probes; __u8 addr_gen_mode; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 860ffbe778cf..a942a18b7943 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp) return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } +static inline s32 rfc3315_s14_backoff_init(s32 irt) +{ + /* multiply 'initial retransmission time' by 0.9 .. 1.1 */ + u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt; + do_div(tmp, 1000000); + return (s32)tmp; +} + +static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) +{ + /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ + u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt; + do_div(tmp, 1000000); + if ((s32)tmp > mrt) { + /* multiply 'maximum retransmission time' by 0.9 .. 
1.1 */ + tmp = (900000 + prandom_u32() % 200001) * (u64)mrt; + do_div(tmp, 1000000); + } + return (s32)tmp; +} + #ifdef CONFIG_SYSCTL static int addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); @@ -187,6 +208,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -233,6 +255,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -3487,7 +3510,7 @@ static void addrconf_rs_timer(unsigned long data) if (idev->if_flags & IF_RA_RCVD) goto out; - if (idev->rs_probes++ < idev->cnf.rtr_solicits) { + if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) { write_unlock(&idev->lock); if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) ndisc_send_rs(dev, &lladdr, @@ -3496,11 +3519,13 @@ static void addrconf_rs_timer(unsigned long data) goto put; write_lock(&idev->lock); + idev->rs_interval = rfc3315_s14_backoff_update( + idev->rs_interval, idev->cnf.rtr_solicit_max_interval); /* The wait after the last probe can be shorter */ addrconf_mod_rs_timer(idev, (idev->rs_probes == idev->cnf.rtr_solicits) ? idev->cnf.rtr_solicit_delay : - idev->cnf.rtr_solicit_interval); + idev->rs_interval); } else { /* * Note: we do not support deprecated "all on-link" @@ -3728,7 +3753,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); send_rs = send_mld && ipv6_accept_ra(ifp->idev) && - ifp->idev->cnf.rtr_solicits > 0 && + ifp->idev->cnf.rtr_solicits != 0 && (dev->flags&IFF_LOOPBACK) == 0; read_unlock_bh(&ifp->idev->lock); @@ -3750,10 +3775,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) write_lock_bh(&ifp->idev->lock); spin_lock(&ifp->lock); + ifp->idev->rs_interval = rfc3315_s14_backoff_init( + ifp->idev->cnf.rtr_solicit_interval); ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; - addrconf_mod_rs_timer(ifp->idev, - ifp->idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval); spin_unlock(&ifp->lock); write_unlock_bh(&ifp->idev->lock); } @@ -4670,6 +4696,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; array[DEVCONF_RTR_SOLICIT_INTERVAL] = jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_max_interval); array[DEVCONF_RTR_SOLICIT_DELAY] = jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; @@ -4879,7 +4907,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) return -EINVAL; if (!ipv6_accept_ra(idev)) return -EINVAL; - if (idev->cnf.rtr_solicits <= 0) + if (idev->cnf.rtr_solicits == 0) return -EINVAL; write_lock_bh(&idev->lock); @@ -4904,8 +4932,10 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) if (update_rs) { idev->if_flags |= IF_RS_SENT; + idev->rs_interval = 
rfc3315_s14_backoff_init( + idev->cnf.rtr_solicit_interval); idev->rs_probes = 1; - addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(idev, idev->rs_interval); } /* Well, that's kinda nasty ... */ @@ -5542,6 +5572,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "router_solicitation_max_interval", + .data = &ipv6_devconf.rtr_solicit_max_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { .procname = "router_solicitation_delay", .data = &ipv6_devconf.rtr_solicit_delay, From 3a75d7a94709a4cf17a059d669d27bc82bb6bf9b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 31 Mar 2017 17:34:55 +0200 Subject: [PATCH 0744/3220] Merge 4.4.59 into android-4.4 Changes in 4.4.59: xfrm: policy: init locks early xfrm_user: validate XFRM_MSG_NEWAE XFRMA_REPLAY_ESN_VAL replay_window xfrm_user: validate XFRM_MSG_NEWAE incoming ESN size harder virtio_balloon: init 1st buffer in stats vq pinctrl: qcom: Don't clear status bit on irq_unmask c6x/ptrace: Remove useless PTRACE_SETREGSET implementation h8300/ptrace: Fix incorrect register transfer count mips/ptrace: Preserve previous registers for short regset write sparc/ptrace: Preserve previous registers for short regset write metag/ptrace: Preserve previous registers for short regset write metag/ptrace: Provide default TXSTATUS for short NT_PRSTATUS metag/ptrace: Reject partial NT_METAG_RPIPE writes fscrypt: remove broken support for detecting keyring key revocation sched/rt: Add a missing rescheduling point Linux 4.4.59 Change-Id: Ifa35307b133cbf29d0a0084bb78a7b0436182b53 Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- arch/c6x/kernel/ptrace.c | 41 ------------------------------ arch/h8300/kernel/ptrace.c | 8 +++--- arch/metag/kernel/ptrace.c | 19 +++++++++++--- arch/mips/kernel/ptrace.c | 3 ++- arch/sparc/kernel/ptrace_64.c | 2 +- drivers/pinctrl/qcom/pinctrl-msm.c | 4 --- drivers/virtio/virtio_balloon.c | 2 ++ fs/ext4/crypto_key.c | 28 +++++--------------- fs/ext4/ext4.h | 14 +--------- fs/ext4/ext4_crypto.h | 1 - fs/f2fs/crypto_key.c | 28 +++++--------------- fs/f2fs/f2fs.h | 14 +--------- fs/f2fs/f2fs_crypto.h | 1 - kernel/sched/deadline.c | 3 +-- kernel/sched/rt.c | 3 +-- net/xfrm/xfrm_policy.c | 10 ++++---- net/xfrm/xfrm_user.c | 9 ++++++- 18 files changed, 57 insertions(+), 135 deletions(-) diff --git a/Makefile b/Makefile index 6450e18ebd7a..fe17b173d773 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 58 +SUBLEVEL = 59 EXTRAVERSION = NAME = Blurry Fish Butt diff --git a/arch/c6x/kernel/ptrace.c b/arch/c6x/kernel/ptrace.c index 3c494e84444d..a511ac16a8e3 100644 --- a/arch/c6x/kernel/ptrace.c +++ b/arch/c6x/kernel/ptrace.c @@ -69,46 +69,6 @@ static int gpr_get(struct task_struct *target, 0, sizeof(*regs)); } -static int gpr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - struct pt_regs *regs = task_pt_regs(target); - - /* Don't copyin TSR or CSR */ - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &regs, - 0, PT_TSR * sizeof(long)); - if (ret) - return ret; - - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - PT_TSR * sizeof(long), - (PT_TSR + 1) * sizeof(long)); - if (ret) - return ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &regs, - (PT_TSR + 1) * sizeof(long), - PT_CSR * sizeof(long)); - if (ret)
- return ret; - - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - PT_CSR * sizeof(long), - (PT_CSR + 1) * sizeof(long)); - if (ret) - return ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &regs, - (PT_CSR + 1) * sizeof(long), -1); - return ret; -} - enum c6x_regset { REGSET_GPR, }; @@ -120,7 +80,6 @@ static const struct user_regset c6x_regsets[] = { .size = sizeof(u32), .align = sizeof(u32), .get = gpr_get, - .set = gpr_set }, }; diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c index 92075544a19a..0dc1c8f622bc 100644 --- a/arch/h8300/kernel/ptrace.c +++ b/arch/h8300/kernel/ptrace.c @@ -95,7 +95,8 @@ static int regs_get(struct task_struct *target, long *reg = (long *)&regs; /* build user regs in buffer */ - for (r = 0; r < ARRAY_SIZE(register_offset); r++) + BUILD_BUG_ON(sizeof(regs) % sizeof(long) != 0); + for (r = 0; r < sizeof(regs) / sizeof(long); r++) *reg++ = h8300_get_reg(target, r); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -113,7 +114,8 @@ static int regs_set(struct task_struct *target, long *reg; /* build user regs in buffer */ - for (reg = (long *)&regs, r = 0; r < ARRAY_SIZE(register_offset); r++) + BUILD_BUG_ON(sizeof(regs) % sizeof(long) != 0); + for (reg = (long *)&regs, r = 0; r < sizeof(regs) / sizeof(long); r++) *reg++ = h8300_get_reg(target, r); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -122,7 +124,7 @@ static int regs_set(struct task_struct *target, return ret; /* write back to pt_regs */ - for (reg = (long *)&regs, r = 0; r < ARRAY_SIZE(register_offset); r++) + for (reg = (long *)&regs, r = 0; r < sizeof(regs) / sizeof(long); r++) h8300_put_reg(target, r, *reg++); return 0; } diff --git a/arch/metag/kernel/ptrace.c b/arch/metag/kernel/ptrace.c index 7563628822bd..5e2dc7defd2c 100644 --- a/arch/metag/kernel/ptrace.c +++ b/arch/metag/kernel/ptrace.c @@ -24,6 +24,16 @@ * user_regset definitions.
*/ +static unsigned long user_txstatus(const struct pt_regs *regs) +{ + unsigned long data = (unsigned long)regs->ctx.Flags; + + if (regs->ctx.SaveMask & TBICTX_CBUF_BIT) + data |= USER_GP_REGS_STATUS_CATCH_BIT; + + return data; +} + int metag_gp_regs_copyout(const struct pt_regs *regs, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) @@ -62,9 +72,7 @@ int metag_gp_regs_copyout(const struct pt_regs *regs, if (ret) goto out; /* TXSTATUS */ - data = (unsigned long)regs->ctx.Flags; - if (regs->ctx.SaveMask & TBICTX_CBUF_BIT) - data |= USER_GP_REGS_STATUS_CATCH_BIT; + data = user_txstatus(regs); ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &data, 4*25, 4*26); if (ret) @@ -119,6 +127,7 @@ int metag_gp_regs_copyin(struct pt_regs *regs, if (ret) goto out; /* TXSTATUS */ + data = user_txstatus(regs); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &data, 4*25, 4*26); if (ret) @@ -244,6 +253,8 @@ int metag_rp_state_copyin(struct pt_regs *regs, unsigned long long *ptr; int ret, i; + if (count < 4*13) + return -EINVAL; /* Read the entire pipeline before making any changes */ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &rp, 0, 4*13); @@ -303,7 +314,7 @@ static int metag_tls_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { int ret; - void __user *tls; + void __user *tls = target->thread.tls_ptr; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); if (ret) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 74d581569778..c95bf18260f8 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -485,7 +485,8 @@ static int fpr_set(struct task_struct *target, &target->thread.fpu, 0, sizeof(elf_fpregset_t)); - for (i = 0; i < NUM_FPU_REGS; i++) { + BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); + for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) { err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 9ddc4928a089..c1566170964f 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -311,7 +311,7 @@ static int genregs64_set(struct task_struct *target, } if (!ret) { - unsigned long y; + unsigned long y = regs->y; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &y, diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 146264a41ec8..9736f9be5447 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -597,10 +597,6 @@ static void msm_gpio_irq_unmask(struct irq_data *d) spin_lock_irqsave(&pctrl->lock, flags); - val = readl(pctrl->regs + g->intr_status_reg); - val &= ~BIT(g->intr_status_bit); - writel(val, pctrl->regs + g->intr_status_reg); - val = readl(pctrl->regs + g->intr_cfg_reg); val |= BIT(g->intr_enable_bit); writel(val, pctrl->regs + g->intr_cfg_reg); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 56f7e2521202..01d15dca940e 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -416,6 +416,8 @@ static int init_vqs(struct virtio_balloon *vb) * Prime this virtqueue with one buffer so the hypervisor can * use it to signal us later (it can't be broken yet!). 
*/ + update_balloon_stats(vb); + sg_init_one(&sg, vb->stats, sizeof vb->stats); if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL) < 0) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 98d58c8ee804..22096e31a720 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -173,8 +173,6 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci) if (!ci) return; - if (ci->ci_keyring_key) - key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(ext4_crypt_info_cachep, ci); } @@ -196,7 +194,7 @@ void ext4_free_encryption_info(struct inode *inode, ext4_free_crypt_info(ci); } -int _ext4_get_encryption_info(struct inode *inode) +int ext4_get_encryption_info(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_crypt_info *crypt_info; @@ -213,22 +211,15 @@ int _ext4_get_encryption_info(struct inode *inode) char mode; int res; + if (ei->i_crypt_info) + return 0; + if (!ext4_read_workqueue) { res = ext4_init_crypto(); if (res) return res; } -retry: - crypt_info = ACCESS_ONCE(ei->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - ext4_free_encryption_info(inode, crypt_info); - goto retry; - } - res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx)); @@ -251,7 +242,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) @@ -294,7 +284,6 @@ retry: keyring_key = NULL; goto out; } - crypt_info->ci_keyring_key = keyring_key; if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING "ext4: key type must be logon\n"); @@ -340,16 +329,13 @@ got_key: ext4_encryption_key_size(mode)); if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { - ext4_free_crypt_info(crypt_info); - goto retry; - } - return 0; + if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY) res = 0; + key_put(keyring_key); ext4_free_crypt_info(crypt_info); memzero_explicit(raw_key, sizeof(raw_key)); return res; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8be0e4d330ab..ac720a31e4ff 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2331,23 +2331,11 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } /* crypto_key.c */ void ext4_free_crypt_info(struct ext4_crypt_info *ci); void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); -int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_has_encryption_key(struct inode *inode); -static inline int ext4_get_encryption_info(struct inode *inode) -{ - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return _ext4_get_encryption_info(inode); - return 0; -} +int ext4_get_encryption_info(struct inode *inode); static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) { diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index 5a1684bd083b..e52637d969db 100644 --- a/fs/ext4/ext4_crypto.h +++ 
b/fs/ext4/ext4_crypto.h @@ -80,7 +80,6 @@ struct ext4_crypt_info { char ci_filename_mode; char ci_flags; struct crypto_ablkcipher *ci_ctfm; - struct key *ci_keyring_key; char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 5de2d866a25c..18595d7a0efc 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -92,7 +92,6 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(f2fs_crypt_info_cachep, ci); } @@ -113,7 +112,7 @@ void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) f2fs_free_crypt_info(ci); } -int _f2fs_get_encryption_info(struct inode *inode) +int f2fs_get_encryption_info(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_crypt_info *crypt_info; @@ -129,18 +128,12 @@ int _f2fs_get_encryption_info(struct inode *inode) char mode; int res; + if (fi->i_crypt_info) + return 0; + res = f2fs_crypto_initialize(); if (res) return res; -retry: - crypt_info = ACCESS_ONCE(fi->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - f2fs_free_encryption_info(inode, crypt_info); - goto retry; - } res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, @@ -159,7 +152,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) @@ -197,7 +189,6 @@ retry: keyring_key = NULL; goto out; } - crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { @@ -230,17 +221,12 @@ retry: if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) { - f2fs_free_crypt_info(crypt_info); - goto retry; - } - return 0; - + if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY && !S_ISREG(inode->i_mode)) res = 0; - + key_put(keyring_key); f2fs_free_crypt_info(crypt_info); memzero_explicit(raw_key, sizeof(raw_key)); return res; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9db5500d63d9..b1aeca83f4be 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2149,7 +2149,6 @@ void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); /* crypto_key.c */ void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); -int _f2fs_get_encryption_info(struct inode *inode); /* crypto_fname.c */ bool f2fs_valid_filenames_enc_mode(uint32_t); @@ -2170,18 +2169,7 @@ void f2fs_exit_crypto(void); int f2fs_has_encryption_key(struct inode *); -static inline int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return _f2fs_get_encryption_info(inode); - return 0; -} +int f2fs_get_encryption_info(struct inode *inode); void f2fs_fname_crypto_free_buffer(struct f2fs_str *); int f2fs_fname_setup_filename(struct inode *, const struct qstr *, diff --git a/fs/f2fs/f2fs_crypto.h 
b/fs/f2fs/f2fs_crypto.h index c2c1c2b63b25..f113f1a1e8c1 100644 --- a/fs/f2fs/f2fs_crypto.h +++ b/fs/f2fs/f2fs_crypto.h @@ -79,7 +79,6 @@ struct f2fs_crypt_info { char ci_filename_mode; char ci_flags; struct crypto_ablkcipher *ci_ctfm; - struct key *ci_keyring_key; char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; }; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9d9eb50d4059..f10b1cb255b2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1800,12 +1800,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) queue_push_tasks(rq); -#else +#endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); -#endif } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8a16cba968c4..541b8494450e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2235,10 +2235,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); -#else +#endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio) resched_curr(rq); -#endif /* CONFIG_SMP */ } } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b5e665b3cfb0..36a50ef9295d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3030,6 +3030,11 @@ static int __net_init xfrm_net_init(struct net *net) { int rv; + /* Initialize the per-net locks here */ + spin_lock_init(&net->xfrm.xfrm_state_lock); + rwlock_init(&net->xfrm.xfrm_policy_lock); + mutex_init(&net->xfrm.xfrm_cfg_mutex); + rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; @@ -3046,11 +3051,6 @@ static int __net_init xfrm_net_init(struct net *net) if (rv < 0) goto out; - /* Initialize the per-net locks here */ - spin_lock_init(&net->xfrm.xfrm_state_lock); - rwlock_init(&net->xfrm.xfrm_policy_lock); - mutex_init(&net->xfrm.xfrm_cfg_mutex); - return 0; out: diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 805681a7d356..7a5a64e70b4d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -412,7 +412,14 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es up = nla_data(rp); ulen = xfrm_replay_state_esn_len(up); - if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen) + /* Check the overall length and the internal bitmap length to avoid + * potential overflow. */ + if (nla_len(rp) < ulen || + xfrm_replay_state_esn_len(replay_esn) != ulen || + replay_esn->bmp_len != up->bmp_len) + return -EINVAL; + + if (up->replay_window > up->bmp_len * sizeof(__u32) * 8) return -EINVAL; return 0; From 5156f3e9f222f9dd9bce5981b67b1517aa5625c9 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 21:14:45 -0800 Subject: [PATCH 0745/3220] ANDROID: sdcardfs: remove unnecessary call to do_munmap Adapted from wrapfs commit 5be6de9ecf02 ("Wrapfs: use vm_munmap in ->mmap") commit 2c9f6014a8bb ("Wrapfs: remove unnecessary call to vm_unmap in ->mmap") Code is unnecessary and causes deadlocks in newer kernels. 
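For context, a sketch of why the unmap call is both redundant and hazardous
here (the call chain is the generic 4.x mmap path; the deadlock reading
follows the two wrapfs commits cited above and is an inference, not a quote):

    vm_mmap_pgoff()
      down_write(&mm->mmap_sem)         /* held across everything below */
      do_mmap_pgoff() -> mmap_region()
        do_munmap(mm, addr, len)        /* any overlap already unmapped */
        file->f_op->mmap()              /* == sdcardfs_mmap() */
          vm_munmap()/do_munmap()       /* redundant; vm_munmap() would take
                                           mmap_sem again and self-deadlock */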
Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: Ia252d60c60799d7e28fc5f1f0f5b5ec2430a2379 --- fs/sdcardfs/file.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index eee4eb5896e5..7e2c50b30782 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -176,12 +176,6 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) goto out; } saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */ - err = do_munmap(current->mm, vma->vm_start, - vma->vm_end - vma->vm_start); - if (err) { - pr_err("sdcardfs: do_munmap failed %d\n", err); - goto out; - } } /* From b2fc288c811eb828012b13f014aafd4d0e2c59b4 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 21:24:58 -0800 Subject: [PATCH 0746/3220] ANDROID: sdcardfs: copy lower inode attributes in ->ioctl Adapted from wrapfs commit fbc9c6f83ea6 ("Wrapfs: copy lower inode attributes in ->ioctl") commit e97d8e26cc9e ("Wrapfs: use file_inode helper") Some ioctls (e.g., EXT2_IOC_SETFLAGS) can change inode attributes, so copy them from lower inode. Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I0f12684b9dbd4088b4a622c7ea9c03087f40e572 --- fs/sdcardfs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 7e2c50b30782..41a550fec3f2 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -113,6 +113,10 @@ static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, if (lower_file->f_op->unlocked_ioctl) err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + /* some ioctls can change inode attributes (EXT2_IOC_SETFLAGS) */ + if (!err) + sdcardfs_copy_and_fix_attrs(file_inode(file), + file_inode(lower_file)); out: return err; } From c4e9b94bcb00e00843c7907ea3656cc2dc34e385 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 9 Mar 2017 21:42:01 -0800 Subject: [PATCH 0747/3220] ANDROID: sdcardfs: fix ->llseek to update upper and lower offset Adapted from wrapfs commit 1d1d23a47baa ("Wrapfs: fix ->llseek to update upper and lower offsets") Fixes bug: xfstests generic/257. f_pos consistently is required by and only by dir_ops->wrapfs_readdir, main_ops is not affected. Signed-off-by: Erez Zadok Signed-off-by: Mengyang Li Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I360a1368ac37ea8966910a58972b81504031d437 --- fs/sdcardfs/file.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 41a550fec3f2..f49df7ac370e 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -316,6 +316,29 @@ static int sdcardfs_fasync(int fd, struct file *file, int flag) return err; } +/* + * Sdcardfs cannot use generic_file_llseek as ->llseek, because it would + * only set the offset of the upper file. So we have to implement our + * own method to set both the upper and lower file offsets + * consistently. 
+ */
+static loff_t sdcardfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+	int err;
+	struct file *lower_file;
+
+	err = generic_file_llseek(file, offset, whence);
+	if (err < 0)
+		goto out;
+
+	lower_file = sdcardfs_lower_file(file);
+	err = generic_file_llseek(lower_file, offset, whence);
+
+out:
+	return err;
+}
+
+
 const struct file_operations sdcardfs_main_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= sdcardfs_read,
@@ -334,7 +357,7 @@ const struct file_operations sdcardfs_main_fops = {
 
 /* trimmed directory options */
 const struct file_operations sdcardfs_dir_fops = {
-	.llseek		= generic_file_llseek,
+	.llseek		= sdcardfs_file_llseek,
 	.read		= generic_read_dir,
 	.iterate	= sdcardfs_readdir,
 	.unlocked_ioctl	= sdcardfs_unlocked_ioctl,

From b2fc288c811eb828012b13f014aafd4d0e2c59b4 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 9 Mar 2017 21:48:09 -0800
Subject: [PATCH 0748/3220] ANDROID: sdcardfs: add read_iter/write_iter
 operations

Adapted from wrapfs
commit f398bf6a7377 ("Wrapfs: add read_iter/write_iter opeations")

Signed-off-by: Erez Zadok
Signed-off-by: Mengyang Li
Signed-off-by: Daniel Rosenberg
Bug: 35766959
Change-Id: I2b3de59c9682fc705bf21df0de6df81e76fd2e40
---
 fs/sdcardfs/file.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c
index f49df7ac370e..c0146e03fa2e 100644
--- a/fs/sdcardfs/file.c
+++ b/fs/sdcardfs/file.c
@@ -338,6 +338,52 @@ out:
 	return err;
 }
 
+/*
+ * Sdcardfs read_iter, redirect modified iocb to lower read_iter
+ */
+ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	int err;
+	struct file *file = iocb->ki_filp, *lower_file;
+
+	lower_file = sdcardfs_lower_file(file);
+	if (!lower_file->f_op->read_iter) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	get_file(lower_file); /* prevent lower_file from being released */
+	iocb->ki_filp = lower_file;
+	err = lower_file->f_op->read_iter(iocb, iter);
+	/* ? wait IO finish to update atime as ecryptfs ? */
+	iocb->ki_filp = file;
+	fput(lower_file);
+out:
+	return err;
+}
+
+/*
+ * Sdcardfs write_iter, redirect modified iocb to lower write_iter
+ */
+ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	int err;
+	struct file *file = iocb->ki_filp, *lower_file;
+
+	lower_file = sdcardfs_lower_file(file);
+	if (!lower_file->f_op->write_iter) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	get_file(lower_file); /* prevent lower_file from being released */
+	iocb->ki_filp = lower_file;
+	err = lower_file->f_op->write_iter(iocb, iter);
+	iocb->ki_filp = file;
+	fput(lower_file);
+out:
+	return err;
+}
 
 const struct file_operations sdcardfs_main_fops = {
 	.llseek		= generic_file_llseek,
@@ -353,6 +399,8 @@ const struct file_operations sdcardfs_main_fops = {
 	.release	= sdcardfs_file_release,
 	.fsync		= sdcardfs_fsync,
 	.fasync		= sdcardfs_fasync,
+	.read_iter	= sdcardfs_read_iter,
+	.write_iter	= sdcardfs_write_iter,
 };
 
 /* trimmed directory options */

From fb12388ce4461c3af91174a0c7e67a81f4c0a758 Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 9 Mar 2017 22:11:08 -0800
Subject: [PATCH 0749/3220] ANDROID: sdcardfs: use d_splice_alias

Adapted from wrapfs commit 9671770ff8b9 ("Wrapfs: use d_splice_alias")

Refactor interpose code to allow lookup to use d_splice_alias.
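The practical difference from the previous d_add() is that d_splice_alias()
may hand back a preexisting alias for the inode instead of using the dentry
it was given, so every later step has to operate on whichever dentry is now
live. Condensed from the hunks below, the pattern is:

    ret_dentry = d_splice_alias(inode, dentry); /* NULL, an alias, or ERR_PTR */
    dentry = ret_dentry ?: dentry;              /* continue with the live one */
    update_derived_permission_lock(dentry);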
Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: Icf51db8658202c48456724275b03dc77f73f585b --- fs/sdcardfs/lookup.c | 55 +++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index cfa2b12b6411..f10f7752f514 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -164,27 +164,25 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, u } /* - * Connect a sdcardfs inode dentry/inode with several lower ones. This is - * the classic stackable file system "vnode interposition" action. - * - * @dentry: sdcardfs's dentry which interposes on lower one - * @sb: sdcardfs's super_block - * @lower_path: the lower path (caller does path_get/put) + * Helper interpose routine, called directly by ->lookup to handle + * spliced dentries. */ -int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, - struct path *lower_path, userid_t id) +static struct dentry *__sdcardfs_interpose(struct dentry *dentry, + struct super_block *sb, + struct path *lower_path, + userid_t id) { - int err = 0; struct inode *inode; struct inode *lower_inode; struct super_block *lower_sb; + struct dentry *ret_dentry; lower_inode = d_inode(lower_path->dentry); lower_sb = sdcardfs_lower_super(sb); /* check that the lower file system didn't cross a mount point */ if (lower_inode->i_sb != lower_sb) { - err = -EXDEV; + ret_dentry = ERR_PTR(-EXDEV); goto out; } @@ -196,14 +194,32 @@ int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb, /* inherit lower inode number for sdcardfs's inode */ inode = sdcardfs_iget(sb, lower_inode, id); if (IS_ERR(inode)) { - err = PTR_ERR(inode); + ret_dentry = ERR_CAST(inode); goto out; } - d_add(dentry, inode); + ret_dentry = d_splice_alias(inode, dentry); + dentry = ret_dentry ?: dentry; update_derived_permission_lock(dentry); out: - return err; + return ret_dentry; +} + +/* + * Connect an sdcardfs inode dentry/inode with several lower ones. This is + * the classic stackable file system "vnode interposition" action. 
+ *
+ * @dentry: sdcardfs's dentry which interposes on lower one
+ * @sb: sdcardfs's super_block
+ * @lower_path: the lower path (caller does path_get/put)
+ */
+int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+		       struct path *lower_path, userid_t id)
+{
+	struct dentry *ret_dentry;
+
+	ret_dentry = __sdcardfs_interpose(dentry, sb, lower_path, id);
+	return PTR_ERR(ret_dentry);
 }
 
 struct sdcardfs_name_data {
@@ -244,6 +260,7 @@ static struct dentry *__sdcardfs_lookup(struct dentry *dentry,
 	const struct qstr *name;
 	struct path lower_path;
 	struct qstr dname;
+	struct dentry *ret_dentry = NULL;
 	struct sdcardfs_sb_info *sbi;
 
 	sbi = SDCARDFS_SB(dentry->d_sb);
@@ -330,9 +347,13 @@ put_name:
 		}
 
 		sdcardfs_set_lower_path(dentry, &lower_path);
-		err = sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id);
-		if (err) /* path_put underlying path on error */
+		ret_dentry =
+			__sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id);
+		if (IS_ERR(ret_dentry)) {
+			err = PTR_ERR(ret_dentry);
+			/* path_put underlying path on error */
 			sdcardfs_put_reset_lower_path(dentry);
+		}
 		goto out;
 	}
 
@@ -372,7 +393,9 @@ setup_lower:
 		err = 0;
 
 out:
-	return ERR_PTR(err);
+	if (err)
+		return ERR_PTR(err);
+	return ret_dentry;
 }
 
 /*

From 24c96f78361315e1f39d9bd9836ec2f3f94f19ee Mon Sep 17 00:00:00 2001
From: Daniel Rosenberg
Date: Thu, 9 Mar 2017 20:56:05 -0800
Subject: [PATCH 0750/3220] ANDROID: sdcardfs: update module info

Signed-off-by: Daniel Rosenberg
Change-Id: I958c7c226d4e9265fea8996803e5b004fb33d8ad
---
 fs/sdcardfs/main.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
index 7344635522cd..953d2156d2e9 100644
--- a/fs/sdcardfs/main.c
+++ b/fs/sdcardfs/main.c
@@ -471,10 +471,15 @@ static void __exit exit_sdcardfs_fs(void)
 	pr_info("Completed sdcardfs module unload\n");
 }
 
-MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University"
-	      " (http://www.fsl.cs.sunysb.edu/)");
-MODULE_DESCRIPTION("Wrapfs " SDCARDFS_VERSION
-		   " (http://wrapfs.filesystems.org/)");
+/* Original wrapfs authors */
+MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University (http://www.fsl.cs.sunysb.edu/)");
+
+/* Original sdcardfs authors */
+MODULE_AUTHOR("Woojoong Lee, Daeho Jeong, Kitae Lee, Yeongjin Gil System Memory Lab., Samsung Electronics");
+
+/* Current maintainer */
+MODULE_AUTHOR("Daniel Rosenberg, Google");
+MODULE_DESCRIPTION("Sdcardfs " SDCARDFS_VERSION);
 MODULE_LICENSE("GPL");
 
 module_init(init_sdcardfs_fs);

From 6c8d409129bbebe36cde9f8e511011756216163a Mon Sep 17 00:00:00 2001
From: zhangshuxiao
Date: Wed, 8 Mar 2017 16:53:24 +0800
Subject: [PATCH 0751/3220] staging: android: ashmem: lseek failed due to no
 FMODE_LSEEK.

vfs_llseek() checks whether the file mode has FMODE_LSEEK and, if not,
returns failure. But ashmem files are seekable, so set FMODE_LSEEK on
the ashmem backing file.
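For context, vfs_llseek() in this era of fs/read_write.c behaves roughly as
follows (a paraphrased sketch, not a verbatim quote); with FMODE_LSEEK clear
it never consults f_op->llseek and fails with -ESPIPE via no_llseek():

    loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
    {
    	loff_t (*fn)(struct file *, loff_t, int);

    	fn = no_llseek;
    	if (file->f_mode & FMODE_LSEEK) {
    		if (file->f_op->llseek)
    			fn = file->f_op->llseek;
    	}
    	return fn(file, offset, whence);
    }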
Change-Id: Ia78ef4c7c96adb89d52e70b63f7c00636fe60d01 Signed-off-by: zhangshuxiao --- drivers/staging/android/ashmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 3f1133230a1a..e4530ac6d5d4 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -392,6 +392,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) ret = PTR_ERR(vmfile); goto out; } + vmfile->f_mode |= FMODE_LSEEK; asma->file = vmfile; } get_file(asma->file); From c71ad0f6b384393d99188ad333429c145f5b594a Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 16 Nov 2016 17:31:31 +0000 Subject: [PATCH 0752/3220] BACKPORT: arm64: dts: juno: fix cluster sleep state entry latency on all SoC versions The core and the cluster sleep state entry latencies can't be same as cluster sleep involves more work compared to core level e.g. shared cache maintenance. Experiments have shown on an average about 100us more latency for the cluster sleep state compared to the core level sleep. This patch fixes the entry latency for the cluster sleep state. Fixes: 28e10a8f3a03 ("arm64: dts: juno: Add idle-states to device tree") Cc: Lorenzo Pieralisi Cc: "Jon Medhurst (Tixy)" Reviewed-by: Liviu Dudau Signed-off-by: Sudeep Holla Signed-off-by: Arnd Bergmann Fixes: Change-Id: I7b2d81fa66f8ce8b229457cfefff06e9edd545c7 (arm64: dts: juno: Add idle-states to device tree) (cherry picked from commit 909e481e2467f202b97d42beef246e8829416a85) Signed-off-by: Amit Pundir --- arch/arm64/boot/dts/arm/juno-r1.dts | 2 +- arch/arm64/boot/dts/arm/juno.dts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/arm/juno-r1.dts b/arch/arm64/boot/dts/arm/juno-r1.dts index 8826f834f54f..29315af22147 100644 --- a/arch/arm64/boot/dts/arm/juno-r1.dts +++ b/arch/arm64/boot/dts/arm/juno-r1.dts @@ -76,7 +76,7 @@ compatible = "arm,idle-state"; arm,psci-suspend-param = <0x1010000>; local-timer-stop; - entry-latency-us = <300>; + entry-latency-us = <400>; exit-latency-us = <1200>; min-residency-us = <2500>; }; diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts index f113a80519cd..a5004931e2f1 100644 --- a/arch/arm64/boot/dts/arm/juno.dts +++ b/arch/arm64/boot/dts/arm/juno.dts @@ -76,7 +76,7 @@ compatible = "arm,idle-state"; arm,psci-suspend-param = <0x1010000>; local-timer-stop; - entry-latency-us = <300>; + entry-latency-us = <400>; exit-latency-us = <1200>; min-residency-us = <2500>; }; From 6a3b9c4984f9edc5a136720e42f1d2ab387857a4 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 10 Apr 2017 18:35:25 +0000 Subject: [PATCH 0753/3220] Revert "Revert "CHROMIUM: android: binder: Fix potential scheduling-while-atomic"" This reverts commit 13c17d0179f9a055062f37e91a6f6cf00a249ebd. 
Change-Id: I8c3a7eefb72b85c0dd05996c2705636fcbc871f7 --- drivers/android/binder.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9cf4f9bbc711..22025bcc38e9 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -417,6 +417,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; + int ret; if (files == NULL) return -ESRCH; @@ -427,7 +428,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - return __alloc_fd(files, 0, rlim_cur, flags); + preempt_enable_no_resched(); + ret = __alloc_fd(files, 0, rlim_cur, flags); + preempt_disable(); + + return ret; } /* @@ -436,8 +441,11 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) + if (proc->files) { + preempt_enable_no_resched(); __fd_install(proc->files, fd, file); + preempt_disable(); + } } /* From a2d978c2adcb7f06b24c4491188d122623384690 Mon Sep 17 00:00:00 2001 From: Lianwei Wang Date: Sun, 19 Jun 2016 23:52:27 -0700 Subject: [PATCH 0754/3220] UPSTREAM: PM / sleep: make PM notifiers called symmetrically (cherry picked from commit ea00f4f4f00cc2bc3b63ad512a4e6df3b20832b9) This makes pm notifier PREPARE/POST symmetrical: if PREPARE fails, we will only undo what ever happened on PREPARE. It fixes the unbalanced CPU hotplug enable in CPU PM notifier. Change-Id: I01dce3cc95c5d6b8913b7b6be301f2909258c745 Signed-off-by: Lianwei Wang Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 20 ++++++++++++-------- kernel/power/main.c | 11 +++++++++-- kernel/power/power.h | 2 ++ kernel/power/suspend.c | 10 ++++++---- kernel/power/user.c | 14 ++++++++------ 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 3124cebaec31..797f19e2aaa9 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -647,7 +647,7 @@ static void power_down(void) */ int hibernate(void) { - int error; + int error, nr_calls = 0; if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); @@ -662,9 +662,11 @@ int hibernate(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Exit; + } printk(KERN_INFO "PM: Syncing filesystems ... 
"); sys_sync(); @@ -714,7 +716,7 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: @@ -740,7 +742,7 @@ int hibernate(void) */ static int software_resume(void) { - int error; + int error, nr_calls = 0; unsigned int flags; /* @@ -827,9 +829,11 @@ static int software_resume(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Close_Finish; + } pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); @@ -855,7 +859,7 @@ static int software_resume(void) unlock_device_hotplug(); thaw_processes(); Finish: - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999900..68c0eaae8034 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int pm_notifier_call_chain(unsigned long val) +int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) { - int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + int ret; + + ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, + nr_to_call, nr_calls); return notifier_to_errno(ret); } +int pm_notifier_call_chain(unsigned long val) +{ + return __pm_notifier_call_chain(val, -1, NULL); +} /* If set, devices may be suspended and resumed asynchronously. 
*/ int pm_async_enabled = 1; diff --git a/kernel/power/power.h b/kernel/power/power.h index caadb566e82b..436da5bc72f4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -191,6 +191,8 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ +extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, + int *nr_calls); extern int pm_notifier_call_chain(unsigned long val); #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 024411816ccf..58209d8bfc56 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -268,16 +268,18 @@ static int suspend_test(int level) */ static int suspend_prepare(suspend_state_t state) { - int error; + int error, nr_calls = 0; if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Finish; + } trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); @@ -288,7 +290,7 @@ static int suspend_prepare(suspend_state_t state) suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); + __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); pm_restore_console(); return error; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 526e8911460a..35310b627388 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1); static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - int error; + int error, nr_calls = 0; if (!hibernation_available()) return -EPERM; @@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; data->free_bitmaps = false; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); } else { /* * Resuming. We may need to wait for the image device to @@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; - } + } else + nr_calls--; + if (error) - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) atomic_inc(&snapshot_device_available); From 8eb24ae4dcd3e2b40a9b0f7ca585ff0dab2350c0 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 7 Apr 2017 15:22:19 -0700 Subject: [PATCH 0755/3220] UPSTREAM: checkpatch: special audit for revert commit line Currently checkpatch.pl does not recognize git's default commit revert message and will complain about the hash format. Add special audit for revert commit message line to fix it. 
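For reference, git's default revert message looks like this (the hash is a
made-up example); the new rule whitelists its second line, whose bare
40-character hash would otherwise trip the commit-description check patched
below:

    Revert "some subject line"

    This reverts commit 0123456789abcdef0123456789abcdef01234567.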
Signed-off-by: Wei Wang Acked-by: Joe Perches Bug: 37158168 Test: checkpatch.pl --patch [diff] and no longer see failure Change-Id: I65cf9a46874621dd6d5c349d2d3ca3b862d61ba3 --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2b3c22808c3b..e70147742cce 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2348,6 +2348,7 @@ sub process { # Check for git id commit length and improperly formed commit descriptions if ($in_commit_log && !$commit_log_possible_stack_dump && + $line !~ /^This reverts commit [0-9a-f]{7,40}/ && ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i || ($line =~ /\b[0-9a-f]{12,40}\b/i && $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i && From 2295052f9de4d8288015e7d97ab0363a71604e64 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 10 Apr 2017 20:54:30 -0700 Subject: [PATCH 0756/3220] ANDROID: sdcardfs: Directly pass lower file for mmap Instead of relying on a copy hack, pass the lower file as private data. This lets the kernel find the vma mapping for pages used by the file, allowing pages used by mapping to be reclaimed. This is adapted from following esdfs patches commit 0647e638d: ("esdfs: store lower file in vm_file for mmap") commit 064850866: ("esdfs: keep a counter for mmaped file") Change-Id: I75b74d1e5061db1b8c13be38d184e118c0851a1a Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/file.c | 3 +++ fs/sdcardfs/mmap.c | 57 ++++++++++++++++++---------------------------- 2 files changed, 25 insertions(+), 35 deletions(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index c0146e03fa2e..1f6921e2ffbf 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -192,6 +192,9 @@ static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma) file->f_mapping->a_ops = &sdcardfs_aops; /* set our aops */ if (!SDCARDFS_F(file)->lower_vm_ops) /* save for our ->fault */ SDCARDFS_F(file)->lower_vm_ops = saved_vm_ops; + vma->vm_private_data = file; + get_file(lower_file); + vma->vm_file = lower_file; out: return err; diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c index 0d4089c62c3a..b61f82275e7d 100644 --- a/fs/sdcardfs/mmap.c +++ b/fs/sdcardfs/mmap.c @@ -23,60 +23,45 @@ static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int err; - struct file *file, *lower_file; + struct file *file; const struct vm_operations_struct *lower_vm_ops; - struct vm_area_struct lower_vma; - memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); - file = lower_vma.vm_file; + file = (struct file *)vma->vm_private_data; lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops; BUG_ON(!lower_vm_ops); - lower_file = sdcardfs_lower_file(file); - /* - * XXX: vm_ops->fault may be called in parallel. Because we have to - * resort to temporarily changing the vma->vm_file to point to the - * lower file, a concurrent invocation of sdcardfs_fault could see a - * different value. In this workaround, we keep a different copy of - * the vma structure in our stack, so we never expose a different - * value of the vma->vm_file called to us, even temporarily. A - * better fix would be to change the calling semantics of ->fault to - * take an explicit file pointer. 
- */ - lower_vma.vm_file = lower_file; - err = lower_vm_ops->fault(&lower_vma, vmf); + err = lower_vm_ops->fault(vma, vmf); return err; } +static void sdcardfs_vm_open(struct vm_area_struct *vma) +{ + struct file *file = (struct file *)vma->vm_private_data; + + get_file(file); +} + +static void sdcardfs_vm_close(struct vm_area_struct *vma) +{ + struct file *file = (struct file *)vma->vm_private_data; + + fput(file); +} + static int sdcardfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { int err = 0; - struct file *file, *lower_file; + struct file *file; const struct vm_operations_struct *lower_vm_ops; - struct vm_area_struct lower_vma; - memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); - file = lower_vma.vm_file; + file = (struct file *)vma->vm_private_data; lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops; BUG_ON(!lower_vm_ops); if (!lower_vm_ops->page_mkwrite) goto out; - lower_file = sdcardfs_lower_file(file); - /* - * XXX: vm_ops->page_mkwrite may be called in parallel. - * Because we have to resort to temporarily changing the - * vma->vm_file to point to the lower file, a concurrent - * invocation of sdcardfs_page_mkwrite could see a different - * value. In this workaround, we keep a different copy of the - * vma structure in our stack, so we never expose a different - * value of the vma->vm_file called to us, even temporarily. - * A better fix would be to change the calling semantics of - * ->page_mkwrite to take an explicit file pointer. - */ - lower_vma.vm_file = lower_file; - err = lower_vm_ops->page_mkwrite(&lower_vma, vmf); + err = lower_vm_ops->page_mkwrite(vma, vmf); out: return err; } @@ -99,4 +84,6 @@ const struct address_space_operations sdcardfs_aops = { const struct vm_operations_struct sdcardfs_vm_ops = { .fault = sdcardfs_fault, .page_mkwrite = sdcardfs_page_mkwrite, + .open = sdcardfs_vm_open, + .close = sdcardfs_vm_close, }; From 84b6001987a05f1d16bc3939f8ebf1cf9bd672e2 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 11 Apr 2017 21:39:47 +0000 Subject: [PATCH 0757/3220] Revert "Revert "Revert "CHROMIUM: android: binder: Fix potential scheduling-while-atomic""" This reverts commit 6a3b9c4984f9edc5a136720e42f1d2ab387857a4. Sigh. Confusion reigns. The rest of the preempt_disable patch is not in common, so this shouldn't be here afterall (it is in several downstream branches that therefore need this one too). Re-reverting. We don't want the preempt_disable stuff in common since fine-grained locking is coming soon. 
Change-Id: I2595516cab28041fa72f4a38692266a0f2a01ab4 --- drivers/android/binder.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 22025bcc38e9..9cf4f9bbc711 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -417,7 +417,6 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; - int ret; if (files == NULL) return -ESRCH; @@ -428,11 +427,7 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - preempt_enable_no_resched(); - ret = __alloc_fd(files, 0, rlim_cur, flags); - preempt_disable(); - - return ret; + return __alloc_fd(files, 0, rlim_cur, flags); } /* @@ -441,11 +436,8 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - if (proc->files) { - preempt_enable_no_resched(); + if (proc->files) __fd_install(proc->files, fd, file); - preempt_disable(); - } } /* From 8affe1f70379abeafb9bc1ee81132beae3e76b93 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 29 Mar 2017 16:11:20 +0200 Subject: [PATCH 0758/3220] UPSTREAM: net/packet: fix overflow in check for priv area size Subtracting tp_sizeof_priv from tp_block_size and casting to int to check whether one is less then the other doesn't always work (both of them are unsigned ints). Compare them as is instead. Also cast tp_sizeof_priv to u64 before using BLK_PLUS_PRIV, as it can overflow inside BLK_PLUS_PRIV otherwise. Bug: 36725304 Upstream commit: 2b6867c2ce76c596676bec7d2d525af525fdc6e2 Signed-off-by: Andrey Konovalov Acked-by: Eric Dumazet Signed-off-by: David S. Miller Change-Id: I46bfbaf5f4a5d80f10ddce731a3030f191de4b28 --- net/packet/af_packet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3975ac809934..d76800108ddb 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4138,8 +4138,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; if (po->tp_version >= TPACKET_V3 && - (int)(req->tp_block_size - - BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0) + req->tp_block_size <= + BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv)) goto out; if (unlikely(req->tp_frame_size < po->tp_hdrlen + po->tp_reserve)) From a973601dcbc98f41c9085ce91a68f314c046cf9c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 23 Dec 2016 00:33:57 +0900 Subject: [PATCH 0759/3220] UPSTREAM: net: ipv4: Don't crash if passing a null sk to ip_do_redirect. Commit e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") made ip_do_redirect call sock_net(sk) to determine the network namespace of the passed-in socket. This crashes if sk is NULL. Fix this by getting the network namespace from the skb instead. Fixes: e2d118a1cb5e ("net: inet: Support UID-based routing in IP protocols.") Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller Change-Id: I16a3c343cb142c482ca6dd363c28b3a12d73a46d Fixes: Change-Id: I910504b508948057912bc188fd1e8aca28294de3 ("net: inet: Support UID-based routing in IP protocols.") (cherry picked from commit 7d99569460eae28b187d574aec930a4cf8b90441) Signed-off-by: Amit Pundir --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9af0f461b00d..d162ce41f761 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -792,6 +792,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; + struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; @@ -799,7 +800,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf rt = (struct rtable *) dst; - __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } From 2a8af3ae8d93b34b4162f7c05e1b0a63cd22ec07 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 10 Jan 2017 09:30:51 +0100 Subject: [PATCH 0760/3220] UPSTREAM: net: socket: Make unnecessarily global sockfs_setattr() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sockfs_setattr() static as it is not used outside of net/socket.c This fixes the following GCC warning: net/socket.c:534:5: warning: no previous prototype for ‘sockfs_setattr’ [-Wmissing-prototypes] Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Cc: Lorenzo Colitti Signed-off-by: Tobias Klauser Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller Change-Id: Ie613c441b3fe081bdaec8c480d3aade482873bf8 Fixes: Change-Id: Idbc3e9a0cec91c4c6e01916b967b6237645ebe59 ("net: core: Add a UID field to struct sock.") (cherry picked from commit dc647ec88e029307e60e6bf9988056605f11051a) Signed-off-by: Amit Pundir --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index 1489761b371e..18aff3d804ec 100644 --- a/net/socket.c +++ b/net/socket.c @@ -520,7 +520,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, return used; } -int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) +static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); From 9e242ec92f951165b52de630ca8ecc6a2748f655 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 11:49:09 -0500 Subject: [PATCH 0761/3220] BACKPORT: [UPSTREAM] mbcache2: reimplement mbcache (Cherry-pick from commit f9a61eb4e2471c56a63cd804c7474128138c38ac) Original mbcache was designed to have more features than what ext? filesystems ended up using. It supported entry being in more hashes, it had a home-grown rwlocking of each entry, and one cache could cache entries from multiple filesystems. This genericity also resulted in more complex locking, larger cache entries, and generally more code complexity. This is reimplementation of the mbcache functionality to exactly fit the purpose ext? filesystems use it for. Cache entries are now considerably smaller (7 instead of 13 longs), the code is considerably smaller as well (414 vs 913 lines of code), and IMO also simpler. The new code is also much more lightweight. 
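As a usage sketch of the resulting API (a hypothetical caller with made-up
key and block values, compiled against <linux/mbcache2.h>; the real users
are the ext? xattr paths converted later in this series):

    /* Hypothetical caller, shown only as an illustration. */
    static int mb2_demo(void)
    {
    	struct mb2_cache *cache = mb2_cache_create(10);	/* 2^10 buckets */
    	struct mb2_cache_entry *ce;
    	u32 hash = 0x1234;	/* hash of an xattr block's contents */
    	sector_t block = 4096;	/* block number storing those contents */

    	if (!cache)
    		return -ENOMEM;

    	/* Record the key->block pair; -EBUSY means already cached. */
    	if (mb2_cache_entry_create(cache, GFP_NOFS, hash, block) == -EBUSY)
    		; /* someone beat us to it; fine */

    	/*
    	 * Walk every cached block whose contents hash to `hash`.
    	 * find_next drops the reference on the entry it is handed, so a
    	 * complete walk needs no explicit mb2_cache_entry_put().
    	 */
    	for (ce = mb2_cache_entry_find_first(cache, hash); ce;
    	     ce = mb2_cache_entry_find_next(cache, ce))
    		; /* ce->e_block is a deduplication candidate */

    	mb2_cache_entry_delete_block(cache, hash, block); /* block freed */
    	mb2_cache_destroy(cache);
    	return 0;
    }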
I have measured the speed using an artificial xattr-bench benchmark, which
spawns P processes, each process sets xattr for F different files, and the
value of xattr is randomly chosen from a pool of V values. Averages of
runtimes for 5 runs for various combinations of parameters are below. The
first value in each cell is old mbcache, the second value is the new
mbcache.

V=10
F\P	1		2		4		8		16		32		64
10	0.158,0.157	0.208,0.196	0.500,0.277	0.798,0.400	3.258,0.584	13.807,1.047	61.339,2.803
100	0.172,0.167	0.279,0.222	0.520,0.275	0.825,0.341	2.981,0.505	12.022,1.202	44.641,2.943
1000	0.185,0.174	0.297,0.239	0.445,0.283	0.767,0.340	2.329,0.480	6.342,1.198	16.440,3.888

V=100
F\P	1		2		4		8		16		32		64
10	0.162,0.153	0.200,0.186	0.362,0.257	0.671,0.496	1.433,0.943	3.801,1.345	7.938,2.501
100	0.153,0.160	0.221,0.199	0.404,0.264	0.945,0.379	1.556,0.485	3.761,1.156	7.901,2.484
1000	0.215,0.191	0.303,0.246	0.471,0.288	0.960,0.347	1.647,0.479	3.916,1.176	8.058,3.160

V=1000
F\P	1		2		4		8		16		32		64
10	0.151,0.129	0.210,0.163	0.326,0.245	0.685,0.521	1.284,0.859	3.087,2.251	6.451,4.801
100	0.154,0.153	0.211,0.191	0.276,0.282	0.687,0.506	1.202,0.877	3.259,1.954	8.738,2.887
1000	0.145,0.179	0.202,0.222	0.449,0.319	0.899,0.333	1.577,0.524	4.221,1.240	9.782,3.579

V=10000
F\P	1		2		4		8		16		32		64
10	0.161,0.154	0.198,0.190	0.296,0.256	0.662,0.480	1.192,0.818	2.989,2.200	6.362,4.746
100	0.176,0.174	0.236,0.203	0.326,0.255	0.696,0.511	1.183,0.855	4.205,3.444	19.510,17.760
1000	0.199,0.183	0.240,0.227	1.159,1.014	2.286,2.154	6.023,6.039	---,10.933	---,36.620

V=100000
F\P	1		2		4		8		16		32		64
10	0.171,0.162	0.204,0.198	0.285,0.230	0.692,0.500	1.225,0.881	2.990,2.243	6.379,4.771
100	0.151,0.171	0.220,0.210	0.295,0.255	0.720,0.518	1.226,0.844	3.423,2.831	19.234,17.544
1000	0.192,0.189	0.249,0.225	1.162,1.043	2.257,2.093	5.853,4.997	---,10.399	---,32.198

We see that the new code is faster in pretty much all the cases and
starting from 4 processes there are significant gains with the new code
resulting in up to 20-times shorter runtimes. Also for large numbers of
cached entries all values for the old code could not be measured as the
kernel started hitting softlockups and died before the test completed.

Signed-off-by: Jan Kara
Signed-off-by: Theodore Ts'o
Bug: 32461228
---
 fs/Makefile              |   2 +-
 fs/mbcache2.c            | 359 +++++++++++++++++++++++++++++++++++++++
 include/linux/mbcache2.h |  50 ++++++
 3 files changed, 410 insertions(+), 1 deletion(-)
 create mode 100644 fs/mbcache2.c
 create mode 100644 include/linux/mbcache2.h

diff --git a/fs/Makefile b/fs/Makefile
index 3b54070cd629..dee237540bc0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
-obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
+obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o mbcache2.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_COREDUMP)		+= coredump.o
diff --git a/fs/mbcache2.c b/fs/mbcache2.c
new file mode 100644
index 000000000000..5c3e1a8c38f6
--- /dev/null
+++ b/fs/mbcache2.c
@@ -0,0 +1,359 @@
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/list_bl.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mbcache2.h>
+
+/*
+ * Mbcache is a simple key-value store. Keys need not be unique, however
+ * key-value pairs are expected to be unique (we use this fact in
+ * mb2_cache_entry_delete_block()).
+ *
+ * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
+ * They use hash of a block contents as a key and block number as a value. + * That's why keys need not be unique (different xattr blocks may end up having + * the same hash). However block number always uniquely identifies a cache + * entry. + * + * We provide functions for creation and removal of entries, search by key, + * and a special "delete entry with given key-value pair" operation. Fixed + * size hash table is used for fast key lookups. + */ + +struct mb2_cache { + /* Hash table of entries */ + struct hlist_bl_head *c_hash; + /* log2 of hash table size */ + int c_bucket_bits; + /* Protects c_lru_list, c_entry_count */ + spinlock_t c_lru_list_lock; + struct list_head c_lru_list; + /* Number of entries in cache */ + unsigned long c_entry_count; + struct shrinker c_shrink; +}; + +static struct kmem_cache *mb2_entry_cache; + +/* + * mb2_cache_entry_create - create entry in cache + * @cache - cache where the entry should be created + * @mask - gfp mask with which the entry should be allocated + * @key - key of the entry + * @block - block that contains data + * + * Creates entry in @cache with key @key and records that data is stored in + * block @block. The function returns -EBUSY if entry with the same key + * and for the same block already exists in cache. Otherwise 0 is returned. + */ +int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, + sector_t block) +{ + struct mb2_cache_entry *entry, *dup; + struct hlist_bl_node *dup_node; + struct hlist_bl_head *head; + + entry = kmem_cache_alloc(mb2_entry_cache, mask); + if (!entry) + return -ENOMEM; + + INIT_LIST_HEAD(&entry->e_lru_list); + /* One ref for hash, one ref returned */ + atomic_set(&entry->e_refcnt, 1); + entry->e_key = key; + entry->e_block = block; + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + entry->e_hash_list_head = head; + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { + if (dup->e_key == key && dup->e_block == block) { + hlist_bl_unlock(head); + kmem_cache_free(mb2_entry_cache, entry); + return -EBUSY; + } + } + hlist_bl_add_head(&entry->e_hash_list, head); + hlist_bl_unlock(head); + + spin_lock(&cache->c_lru_list_lock); + list_add_tail(&entry->e_lru_list, &cache->c_lru_list); + /* Grab ref for LRU list */ + atomic_inc(&entry->e_refcnt); + cache->c_entry_count++; + spin_unlock(&cache->c_lru_list_lock); + + return 0; +} +EXPORT_SYMBOL(mb2_cache_entry_create); + +void __mb2_cache_entry_free(struct mb2_cache_entry *entry) +{ + kmem_cache_free(mb2_entry_cache, entry); +} +EXPORT_SYMBOL(__mb2_cache_entry_free); + +static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, + struct mb2_cache_entry *entry, + u32 key) +{ + struct mb2_cache_entry *old_entry = entry; + struct hlist_bl_node *node; + struct hlist_bl_head *head; + + if (entry) + head = entry->e_hash_list_head; + else + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) + node = entry->e_hash_list.next; + else + node = hlist_bl_first(head); + while (node) { + entry = hlist_bl_entry(node, struct mb2_cache_entry, + e_hash_list); + if (entry->e_key == key) { + atomic_inc(&entry->e_refcnt); + goto out; + } + node = node->next; + } + entry = NULL; +out: + hlist_bl_unlock(head); + if (old_entry) + mb2_cache_entry_put(cache, old_entry); + + return entry; +} + +/* + * mb2_cache_entry_find_first - find the first entry in cache with given key + * @cache: cache where we should search + * @key: key to look for 
+ * + * Search in @cache for entry with key @key. Grabs reference to the first + * entry found and returns the entry. + */ +struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, + u32 key) +{ + return __entry_find(cache, NULL, key); +} +EXPORT_SYMBOL(mb2_cache_entry_find_first); + +/* + * mb2_cache_entry_find_next - find next entry in cache with the same + * @cache: cache where we should search + * @entry: entry to start search from + * + * Finds next entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races + * with the search), finds the first entry in the hash chain. The function + * drops reference to @entry and returns with a reference to the found entry. + */ +struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, + struct mb2_cache_entry *entry) +{ + return __entry_find(cache, entry, entry->e_key); +} +EXPORT_SYMBOL(mb2_cache_entry_find_next); + +/* mb2_cache_entry_delete_block - remove information about block from cache + * @cache - cache we work with + * @key - key of the entry to remove + * @block - block containing data for @key + * + * Remove entry from cache @cache with key @key with data stored in @block. + */ +void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, + sector_t block) +{ + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb2_cache_entry *entry; + + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + /* We keep hash list reference to keep entry alive */ + hlist_bl_del_init(&entry->e_hash_list); + hlist_bl_unlock(head); + spin_lock(&cache->c_lru_list_lock); + if (!list_empty(&entry->e_lru_list)) { + list_del_init(&entry->e_lru_list); + cache->c_entry_count--; + atomic_dec(&entry->e_refcnt); + } + spin_unlock(&cache->c_lru_list_lock); + mb2_cache_entry_put(cache, entry); + return; + } + } + hlist_bl_unlock(head); +} +EXPORT_SYMBOL(mb2_cache_entry_delete_block); + +/* mb2_cache_entry_touch - cache entry got used + * @cache - cache the entry belongs to + * @entry - entry that got used + * + * Move entry in lru list to reflect the fact that it was used. + */ +void mb2_cache_entry_touch(struct mb2_cache *cache, + struct mb2_cache_entry *entry) +{ + spin_lock(&cache->c_lru_list_lock); + if (!list_empty(&entry->e_lru_list)) + list_move_tail(&cache->c_lru_list, &entry->e_lru_list); + spin_unlock(&cache->c_lru_list_lock); +} +EXPORT_SYMBOL(mb2_cache_entry_touch); + +static unsigned long mb2_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct mb2_cache *cache = container_of(shrink, struct mb2_cache, + c_shrink); + + return cache->c_entry_count; +} + +/* Shrink number of entries in cache */ +static unsigned long mb2_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + struct mb2_cache *cache = container_of(shrink, struct mb2_cache, + c_shrink); + struct mb2_cache_entry *entry; + struct hlist_bl_head *head; + unsigned int shrunk = 0; + + spin_lock(&cache->c_lru_list_lock); + while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { + entry = list_first_entry(&cache->c_lru_list, + struct mb2_cache_entry, e_lru_list); + list_del_init(&entry->e_lru_list); + cache->c_entry_count--; + /* + * We keep LRU list reference so that entry doesn't go away + * from under us. 
+ */ + spin_unlock(&cache->c_lru_list_lock); + head = entry->e_hash_list_head; + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } + hlist_bl_unlock(head); + if (mb2_cache_entry_put(cache, entry)) + shrunk++; + cond_resched(); + spin_lock(&cache->c_lru_list_lock); + } + spin_unlock(&cache->c_lru_list_lock); + + return shrunk; +} + +/* + * mb2_cache_create - create cache + * @bucket_bits: log2 of the hash table size + * + * Create cache for keys with 2^bucket_bits hash entries. + */ +struct mb2_cache *mb2_cache_create(int bucket_bits) +{ + struct mb2_cache *cache; + int bucket_count = 1 << bucket_bits; + int i; + + if (!try_module_get(THIS_MODULE)) + return NULL; + + cache = kzalloc(sizeof(struct mb2_cache), GFP_KERNEL); + if (!cache) + goto err_out; + cache->c_bucket_bits = bucket_bits; + INIT_LIST_HEAD(&cache->c_lru_list); + spin_lock_init(&cache->c_lru_list_lock); + cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), + GFP_KERNEL); + if (!cache->c_hash) { + kfree(cache); + goto err_out; + } + for (i = 0; i < bucket_count; i++) + INIT_HLIST_BL_HEAD(&cache->c_hash[i]); + + cache->c_shrink.count_objects = mb2_cache_count; + cache->c_shrink.scan_objects = mb2_cache_scan; + cache->c_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&cache->c_shrink); + + return cache; + +err_out: + module_put(THIS_MODULE); + return NULL; +} +EXPORT_SYMBOL(mb2_cache_create); + +/* + * mb2_cache_destroy - destroy cache + * @cache: the cache to destroy + * + * Free all entries in cache and cache itself. Caller must make sure nobody + * (except shrinker) can reach @cache when calling this. + */ +void mb2_cache_destroy(struct mb2_cache *cache) +{ + struct mb2_cache_entry *entry, *next; + + unregister_shrinker(&cache->c_shrink); + + /* + * We don't bother with any locking. Cache must not be used at this + * point. 
+ */
+	list_for_each_entry_safe(entry, next, &cache->c_lru_list, e_lru_list) {
+		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
+			hlist_bl_del_init(&entry->e_hash_list);
+			atomic_dec(&entry->e_refcnt);
+		} else
+			WARN_ON(1);
+		list_del(&entry->e_lru_list);
+		WARN_ON(atomic_read(&entry->e_refcnt) != 1);
+		mb2_cache_entry_put(cache, entry);
+	}
+	kfree(cache->c_hash);
+	kfree(cache);
+	module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL(mb2_cache_destroy);
+
+static int __init mb2cache_init(void)
+{
+	mb2_entry_cache = kmem_cache_create("mbcache",
+				sizeof(struct mb2_cache_entry), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+	BUG_ON(!mb2_entry_cache);
+	return 0;
+}
+
+static void __exit mb2cache_exit(void)
+{
+	kmem_cache_destroy(mb2_entry_cache);
+}
+
+module_init(mb2cache_init)
+module_exit(mb2cache_exit)
+
+MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h
new file mode 100644
index 000000000000..b6f160ff2533
--- /dev/null
+++ b/include/linux/mbcache2.h
@@ -0,0 +1,50 @@
+#ifndef _LINUX_MB2CACHE_H
+#define _LINUX_MB2CACHE_H
+
+#include <linux/hash.h>
+#include <linux/list_bl.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/fs.h>
+
+struct mb2_cache;
+
+struct mb2_cache_entry {
+	/* LRU list - protected by cache->c_lru_list_lock */
+	struct list_head	e_lru_list;
+	/* Hash table list - protected by bitlock in e_hash_list_head */
+	struct hlist_bl_node	e_hash_list;
+	atomic_t		e_refcnt;
+	/* Key in hash - stable during lifetime of the entry */
+	u32			e_key;
+	/* Block number of hashed block - stable during lifetime of the entry */
+	sector_t		e_block;
+	/* Head of hash list (for list bit lock) - stable */
+	struct hlist_bl_head	*e_hash_list_head;
+};
+
+struct mb2_cache *mb2_cache_create(int bucket_bits);
+void mb2_cache_destroy(struct mb2_cache *cache);
+
+int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key,
+			   sector_t block);
+void __mb2_cache_entry_free(struct mb2_cache_entry *entry);
+static inline int mb2_cache_entry_put(struct mb2_cache *cache,
+				      struct mb2_cache_entry *entry)
+{
+	if (!atomic_dec_and_test(&entry->e_refcnt))
+		return 0;
+	__mb2_cache_entry_free(entry);
+	return 1;
+}
+
+void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key,
+				  sector_t block);
+struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache,
+						   u32 key);
+struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache,
+						  struct mb2_cache_entry *entry);
+void mb2_cache_entry_touch(struct mb2_cache *cache,
+			   struct mb2_cache_entry *entry);
+
+#endif	/* _LINUX_MB2CACHE_H */

From e29de4e87171faa5d3ab882b0bc555bbd07c9c5b Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 22 Feb 2016 11:50:13 -0500
Subject: [PATCH 0762/3220] BACKPORT: [UPSTREAM] ext4: convert to mbcache2

(Cherry-pick from commit 82939d7999dfc1f1998c4b1c12e2f19edbdff272)

The conversion is generally straightforward. The only tricky part is
that xattr block corresponding to found mbcache entry can get freed
before we get buffer lock for that block. So we have to check whether
the entry is still valid after getting buffer lock.
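The check itself is tiny; condensed from the ext4_xattr_block_set() hunk
below into a hypothetical helper:

    /*
     * Freeing or rehashing an xattr block unhashes its mbcache entry while
     * the buffer lock is held, so under the buffer lock this test reliably
     * says whether the entry still describes a live xattr block.
     */
    static bool xattr_block_entry_still_valid(struct buffer_head *bh,
    					      struct mb2_cache_entry *ce)
    {
    	bool valid;

    	lock_buffer(bh);
    	valid = !hlist_bl_unhashed(&ce->e_hash_list);
    	if (!valid)
    		unlock_buffer(bh);	/* caller undoes its work and retries */
    	return valid;			/* on success bh stays locked */
    }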
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Bug; 32461228 --- fs/ext4/ext4.h | 2 +- fs/ext4/super.c | 7 ++- fs/ext4/xattr.c | 136 ++++++++++++++++++++++++------------------------ fs/ext4/xattr.h | 5 +- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ac720a31e4ff..2b3ac9fa9460 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1442,7 +1442,7 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb_cache *s_mb_cache; + struct mb2_cache *s_mb_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 68345a9e59b8..bd8831bfbafe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -816,7 +816,6 @@ static void ext4_put_super(struct super_block *sb) ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); @@ -3833,7 +3832,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) no_journal: if (ext4_mballoc_ready) { - sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + sbi->s_mb_cache = ext4_xattr_create_cache(); if (!sbi->s_mb_cache) { ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); goto failed_mount_wq; @@ -4065,6 +4064,10 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 263002f0389d..f7455e668474 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include "ext4_jbd2.h" #include "ext4.h" @@ -80,10 +80,10 @@ # define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, - struct mb_cache_entry **); + struct mb2_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); static int ext4_xattr_list(struct dentry *dentry, char *buffer, @@ -279,7 +279,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -426,7 +426,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -543,11 +543,8 @@ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - struct mb_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) @@ -555,9 +552,15 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, lock_buffer(bh); if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect freed block + */ + mb2_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, @@ -565,8 +568,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, EXT4_FREE_BLOCKS_FORGET); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect @@ -779,17 +780,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search *s = &bs->s; - struct mb_cache_entry *ce = NULL; + struct mb2_cache_entry *ce = NULL; int error = 0; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, - bs->bh->b_blocknr); BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -797,10 +796,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { - if (ce) { - mb_cache_entry_free(ce); - ce = NULL; - } + __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); + + /* + * This must happen under buffer lock 
for + * ext4_xattr_block_set() to reliably detect modified + * block + */ + mb2_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s); if (!error) { @@ -824,10 +828,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - if (ce) { - mb_cache_entry_release(ce); - ce = NULL; - } ea_bdebug(bs->bh, "cloning"); s->base = kmalloc(bs->bh->b_size, GFP_NOFS); error = -ENOMEM; @@ -882,6 +882,31 @@ inserted: if (error) goto cleanup_dquot; lock_buffer(new_bh); + /* + * We have to be careful about races with + * freeing or rehashing of xattr block. Once we + * hold buffer lock xattr block's state is + * stable so we can check whether the block got + * freed / rehashed or not. Since we unhash + * mbcache entry under buffer lock when freeing + * / rehashing xattr block, checking whether + * entry is still hashed is reliable. + */ + if (hlist_bl_unhashed(&ce->e_hash_list)) { + /* + * Undo everything and check mbcache + * again. + */ + unlock_buffer(new_bh); + dquot_free_block(inode, + EXT4_C2B(EXT4_SB(sb), + 1)); + brelse(new_bh); + mb2_cache_entry_put(ext4_mb_cache, ce); + ce = NULL; + new_bh = NULL; + goto inserted; + } le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); @@ -892,7 +917,8 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_release(ce); + mb2_cache_entry_touch(ext4_mb_cache, ce); + mb2_cache_entry_put(ext4_mb_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -957,7 +983,7 @@ getblk_failed: cleanup: if (ce) - mb_cache_entry_release(ce); + mb2_cache_entry_put(ext4_mb_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1518,17 +1544,6 @@ cleanup: brelse(bh); } -/* - * ext4_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext4_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - /* * ext4_xattr_cache_insert() * @@ -1538,28 +1553,18 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. 
*/ static void -ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb2_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); - if (!ce) { - ea_bdebug(bh, "out of memory"); - return; - } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + error = mb2_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + bh->b_blocknr); if (error) { - mb_cache_entry_free(ce); - if (error == -EBUSY) { + if (error == -EBUSY) ea_bdebug(bh, "already in cache"); - error = 0; - } - } else { + } else ea_bdebug(bh, "inserting [%x]", (int)hash); - mb_cache_entry_release(ce); - } } /* @@ -1612,26 +1617,19 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, */ static struct buffer_head * ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb_cache_entry **pce) + struct mb2_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); - struct mb_cache_entry *ce; - struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb2_cache_entry *ce; + struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: - ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, - hash); + ce = mb2_cache_entry_find_first(ext4_mb_cache, hash); while (ce) { struct buffer_head *bh; - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", @@ -1647,7 +1645,7 @@ again: return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb2_cache_entry_find_next(ext4_mb_cache, ce); } return NULL; } @@ -1722,15 +1720,15 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #define HASH_BUCKET_BITS 10 -struct mb_cache * -ext4_xattr_create_cache(char *name) +struct mb2_cache * +ext4_xattr_create_cache(void) { - return mb_cache_create(name, HASH_BUCKET_BITS); + return mb2_cache_create(HASH_BUCKET_BITS); } -void ext4_xattr_destroy_cache(struct mb_cache *cache) +void ext4_xattr_destroy_cache(struct mb2_cache *cache) { if (cache) - mb_cache_destroy(cache); + mb2_cache_destroy(cache); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ddc0957760ba..10b0f7323ed6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); extern void ext4_xattr_delete_inode(handle_t *, struct inode *); -extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); @@ -124,8 +123,8 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern struct mb_cache *ext4_xattr_create_cache(char *name); -extern void ext4_xattr_destroy_cache(struct mb_cache *); +extern struct mb2_cache *ext4_xattr_create_cache(void); +extern void ext4_xattr_destroy_cache(struct mb2_cache *); #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, From 356d8075469a6cab1ecea241923e56dd00641109 Mon Sep 17 
00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 11:56:38 -0500 Subject: [PATCH 0763/3220] BACKPORT: [UPSTREAM] ext2: convert to mbcache2 (Cherry-pick from commit be0726d33cb8f411945884664924bed3cb8c70ee) The conversion is generally straightforward. We convert filesystem from a global cache to per-fs one. Similarly to ext4 the tricky part is that xattr block corresponding to found mbcache entry can get freed before we get buffer lock for that block. So we have to check whether the entry is still valid after getting the buffer lock. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Bug: 32461228 --- fs/ext2/ext2.h | 3 + fs/ext2/super.c | 25 ++++++--- fs/ext2/xattr.c | 143 +++++++++++++++++++++++------------------------- fs/ext2/xattr.h | 21 ++----- 4 files changed, 92 insertions(+), 100 deletions(-) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 4c69c94cafd8..f98ce7e60a0f 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -61,6 +61,8 @@ struct ext2_block_alloc_info { #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end +struct mb2_cache; + /* * second extended-fs super-block data in memory */ @@ -111,6 +113,7 @@ struct ext2_sb_info { * of the mount options. */ spinlock_t s_lock; + struct mb2_cache *s_mb_cache; }; static inline spinlock_t * diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 748d35afc902..111a31761ffa 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb) dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - ext2_xattr_put_super(sb); + if (sbi->s_mb_cache) { + ext2_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } + +#ifdef CONFIG_EXT2_FS_XATTR + sbi->s_mb_cache = ext2_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount3; + } +#endif /* * set up enough so that it can read an inode */ @@ -1149,6 +1160,8 @@ cantfind_ext2: sb->s_id); goto failed_mount; failed_mount3: + if (sbi->s_mb_cache) + ext2_xattr_destroy_cache(sbi->s_mb_cache); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2"); static int __init init_ext2_fs(void) { - int err = init_ext2_xattr(); - if (err) - return err; + int err; + err = init_inodecache(); if (err) - goto out1; + return err; err = register_filesystem(&ext2_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); -out1: - exit_ext2_xattr(); return err; } @@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void) { unregister_filesystem(&ext2_fs_type); destroy_inodecache(); - exit_ext2_xattr(); } MODULE_AUTHOR("Remy Card and others"); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index fa70848afa8f..24736c8b3d51 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include #include #include @@ -92,14 +92,12 @@ static int ext2_xattr_set2(struct inode *, struct buffer_head *, struct ext2_xattr_header *); -static int ext2_xattr_cache_insert(struct buffer_head *); +static int ext2_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); static struct buffer_head 
*ext2_xattr_cache_find(struct inode *, struct ext2_xattr_header *); static void ext2_xattr_rehash(struct ext2_xattr_header *, struct ext2_xattr_entry *); -static struct mb_cache *ext2_xattr_cache; - static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -154,6 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -198,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; goto cleanup; @@ -211,7 +210,7 @@ found: le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) goto bad_block; - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); if (buffer) { error = -ERANGE; @@ -249,6 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -283,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", goto bad_block; entry = next; } - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); /* list the attribute names */ @@ -480,22 +480,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set", /* Here we know that we can set the new attribute. */ if (header) { - struct mb_cache_entry *ce; - /* assert(header == HDR(bh)); */ - ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, - bh->b_blocknr); lock_buffer(bh); if (header->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(header->h_hash); + ea_bdebug(bh, "modifying in-place"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext2_xattr_set2() to reliably detect modified block + */ + mb2_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, + hash, bh->b_blocknr); + /* keep the buffer locked while modifying it. */ } else { int offset; - if (ce) - mb_cache_entry_release(ce); unlock_buffer(bh); ea_bdebug(bh, "cloning"); header = kmalloc(bh->b_size, GFP_KERNEL); @@ -623,6 +624,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; + struct mb2_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -650,7 +652,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, don't need to change the reference count. 
*/ new_bh = old_bh; get_bh(new_bh); - ext2_xattr_cache_insert(new_bh); + ext2_xattr_cache_insert(ext2_mb_cache, new_bh); } else { /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, @@ -671,7 +673,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, memcpy(new_bh->b_data, header, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext2_xattr_cache_insert(new_bh); + ext2_xattr_cache_insert(ext2_mb_cache, new_bh); ext2_xattr_update_super_block(sb); } @@ -704,19 +706,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, error = 0; if (old_bh && old_bh != new_bh) { - struct mb_cache_entry *ce; - /* * If there was an old block and we are no longer using it, * release the old block. */ - ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, - old_bh->b_blocknr); lock_buffer(old_bh); if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash); + + /* + * This must happen under buffer lock for + * ext2_xattr_set2() to reliably detect freed block + */ + mb2_cache_entry_delete_block(ext2_mb_cache, + hash, old_bh->b_blocknr); /* Free the old block. */ - if (ce) - mb_cache_entry_free(ce); ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); mark_inode_dirty(inode); @@ -727,8 +731,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, } else { /* Decrement the refcount only. */ le32_add_cpu(&HDR(old_bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); dquot_free_block_nodirty(inode, 1); mark_inode_dirty(inode); mark_buffer_dirty(old_bh); @@ -754,7 +756,6 @@ void ext2_xattr_delete_inode(struct inode *inode) { struct buffer_head *bh = NULL; - struct mb_cache_entry *ce; down_write(&EXT2_I(inode)->xattr_sem); if (!EXT2_I(inode)->i_file_acl) @@ -774,19 +775,22 @@ ext2_xattr_delete_inode(struct inode *inode) EXT2_I(inode)->i_file_acl); goto cleanup; } - ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); lock_buffer(bh); if (HDR(bh)->h_refcount == cpu_to_le32(1)) { - if (ce) - mb_cache_entry_free(ce); + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); + + /* + * This must happen under buffer lock for ext2_xattr_set2() to + * reliably detect freed block + */ + mb2_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, + hash, bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); unlock_buffer(bh); } else { le32_add_cpu(&HDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount)); unlock_buffer(bh); @@ -802,18 +806,6 @@ cleanup: up_write(&EXT2_I(inode)->xattr_sem); } -/* - * ext2_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext2_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - - /* * ext2_xattr_cache_insert() * @@ -823,28 +815,20 @@ ext2_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. 
 */
static int
-ext2_xattr_cache_insert(struct buffer_head *bh)
+ext2_xattr_cache_insert(struct mb2_cache *cache, struct buffer_head *bh)
{
__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
- struct mb_cache_entry *ce;
int error;

- ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
- if (!ce)
- return -ENOMEM;
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+ error = mb2_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr);
if (error) {
- mb_cache_entry_free(ce);
if (error == -EBUSY) {
- ea_bdebug(bh, "already in cache (%d cache entries)",
- atomic_read(&ext2_xattr_cache->c_entry_count));
+ ea_bdebug(bh, "already in cache");
error = 0;
}
- } else {
- ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
- atomic_read(&ext2_xattr_cache->c_entry_count));
- mb_cache_entry_release(ce);
- }
+ } else
+ ea_bdebug(bh, "inserting [%x]", (int)hash);
return error;
}

/*
@@ -900,23 +884,17 @@ static struct buffer_head *
ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
{
__u32 hash = le32_to_cpu(header->h_hash);
- struct mb_cache_entry *ce;
+ struct mb2_cache_entry *ce;
+ struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;

if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
- hash);
+ ce = mb2_cache_entry_find_first(ext2_mb_cache, hash);
while (ce) {
struct buffer_head *bh;

- if (IS_ERR(ce)) {
- if (PTR_ERR(ce) == -EAGAIN)
- goto again;
- break;
- }
-
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_cache_find",
@@ -924,7 +902,21 @@ again:
inode->i_ino, (unsigned long) ce->e_block);
} else {
lock_buffer(bh);
+ /*
+ * We have to be careful about races with freeing or
+ * rehashing of xattr block. Once we hold buffer lock
+ * xattr block's state is stable so we can check
+ * whether the block got freed / rehashed or not.
+ * Since we unhash mbcache entry under buffer lock when
+ * freeing / rehashing xattr block, checking whether
+ * entry is still hashed is reliable.
+ */ + if (hlist_bl_unhashed(&ce->e_hash_list)) { + mb2_cache_entry_put(ext2_mb_cache, ce); + unlock_buffer(bh); + brelse(bh); + goto again; + } else if (le32_to_cpu(HDR(bh)->h_refcount) > EXT2_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %ld refcount %d>%d", (unsigned long) ce->e_block, @@ -933,13 +925,14 @@ again: } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb_cache_entry_release(ce); + mb2_cache_entry_touch(ext2_mb_cache, ce); + mb2_cache_entry_put(ext2_mb_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb2_cache_entry_find_next(ext2_mb_cache, ce); } return NULL; } @@ -1012,17 +1005,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -init_ext2_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb2_cache *ext2_xattr_create_cache(void) { - ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); - if (!ext2_xattr_cache) - return -ENOMEM; - return 0; + return mb2_cache_create(HASH_BUCKET_BITS); } -void -exit_ext2_xattr(void) +void ext2_xattr_destroy_cache(struct mb2_cache *cache) { - mb_cache_destroy(ext2_xattr_cache); + if (cache) + mb2_cache_destroy(cache); } diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 60edf298644e..6ea38aa9563a 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -53,6 +53,8 @@ struct ext2_xattr_entry { #define EXT2_XATTR_SIZE(size) \ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) +struct mb2_cache; + # ifdef CONFIG_EXT2_FS_XATTR extern const struct xattr_handler ext2_xattr_user_handler; @@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern void ext2_xattr_delete_inode(struct inode *); -extern void ext2_xattr_put_super(struct super_block *); -extern int init_ext2_xattr(void); -extern void exit_ext2_xattr(void); +extern struct mb2_cache *ext2_xattr_create_cache(void); +extern void ext2_xattr_destroy_cache(struct mb2_cache *cache); extern const struct xattr_handler *ext2_xattr_handlers[]; @@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode) { } -static inline void -ext2_xattr_put_super(struct super_block *sb) -{ -} - -static inline int -init_ext2_xattr(void) -{ - return 0; -} - -static inline void -exit_ext2_xattr(void) +static inline void ext2_xattr_destroy_cache(struct mb2_cache *cache) { } From d657201d48f0cff729c89fafeada716410934f5c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 17 Apr 2017 17:11:38 -0700 Subject: [PATCH 0764/3220] Android: sdcardfs: Change cache GID value Change-Id: Ieb955dd26493da26a458bc20fbbe75bca32b094f Signed-off-by: Daniel Rosenberg Bug: 37193650 --- fs/sdcardfs/derived_perm.c | 2 +- fs/sdcardfs/multiuser.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index f82fedd29007..11cb402d3465 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -222,7 +222,7 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) break; case PERM_ANDROID_PACKAGE_CACHE: if (info->d_uid != 0) - gid = multiuser_get_cache_gid(info->d_uid); + gid = multiuser_get_ext_cache_gid(info->d_uid); else gid = multiuser_get_uid(info->userid, uid); break; diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index 2e89b5872314..d0c925cda299 100644 --- a/fs/sdcardfs/multiuser.h +++ 
b/fs/sdcardfs/multiuser.h @@ -23,6 +23,8 @@ #define AID_APP_END 19999 /* last app user */ #define AID_CACHE_GID_START 20000 /* start of gids for apps to mark cached data */ #define AID_EXT_GID_START 30000 /* start of gids for apps to mark external data */ +#define AID_EXT_CACHE_GID_START 40000 /* start of gids for apps to mark external cached data */ +#define AID_EXT_CACHE_GID_END 49999 /* end of gids for apps to mark external cached data */ #define AID_SHARED_GID_START 50000 /* start of gids for apps in each user to share */ typedef uid_t userid_t; @@ -33,9 +35,9 @@ static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } -static inline gid_t multiuser_get_cache_gid(uid_t uid) +static inline gid_t multiuser_get_ext_cache_gid(uid_t uid) { - return uid - AID_APP_START + AID_CACHE_GID_START; + return uid - AID_APP_START + AID_EXT_CACHE_GID_START; } static inline gid_t multiuser_get_ext_gid(uid_t uid) From 6f28e6ebccbaeee26ee72669215cf1e4d0d79537 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 18 Apr 2017 12:45:48 -0700 Subject: [PATCH 0765/3220] ANDROID: sdcardfs: ->iget fixes Adapted from wrapfs commit 8c49eaa0sb9c ("Wrapfs: ->iget fixes") Change where we igrab/iput to ensure we always hold a valid lower_inode. Return ENOMEM (not EACCES) if iget5_locked returns NULL. Signed-off-by: Erez Zadok Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: Id8d4e0c0cbc685a0a77685ce73c923e9a3ddc094 --- fs/sdcardfs/lookup.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index f10f7752f514..a0f221501b4c 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -91,7 +91,9 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, u struct sdcardfs_inode_info *info; struct inode_data data; struct inode *inode; /* the new inode to return */ - int err; + + if (!igrab(lower_inode)) + return ERR_PTR(-ESTALE); data.id = id; data.lower_inode = lower_inode; @@ -106,22 +108,19 @@ struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, u sdcardfs_inode_set, /* inode init function */ &data); /* data passed to test+set fxns */ if (!inode) { - err = -EACCES; iput(lower_inode); - return ERR_PTR(err); + return ERR_PTR(-ENOMEM); } - /* if found a cached inode, then just return it */ - if (!(inode->i_state & I_NEW)) + /* if found a cached inode, then just return it (after iput) */ + if (!(inode->i_state & I_NEW)) { + iput(lower_inode); return inode; + } /* initialize new inode */ info = SDCARDFS_I(inode); inode->i_ino = lower_inode->i_ino; - if (!igrab(lower_inode)) { - err = -ESTALE; - return ERR_PTR(err); - } sdcardfs_set_lower_inode(inode, lower_inode); inode->i_version++; From e92f72194da2e690d85fc736661d3f0d96825c57 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 18 Apr 2017 22:25:15 -0700 Subject: [PATCH 0766/3220] Android: sdcardfs: Don't do d_add for lower fs For file based encryption, ext4 explicitly does not create negative dentries for encrypted files. If you force one over it, the decrypted file will be hidden until the cache is cleared. Instead, just fail out. 
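To make the failure mode concrete, the sequence being removed looks roughly
like this (a sketch of the old lookup path, not the verbatim hunk, which
follows below):

    /* Sketch: fabricating a hashed negative dentry in the lower fs. */
    lower_dentry = d_lookup(lower_dir_dentry, &dname);
    if (!lower_dentry) {
            lower_dentry = d_alloc(lower_dir_dentry, &dname); /* may fail */
            d_add(lower_dentry, NULL); /* hash it with no inode: "negative" */
    }

Once such a negative dentry is hashed over an encrypted name, later lookups
hit it in the dcache, so the decrypted file stays hidden until the entry is
evicted; the change below fails the lookup with -ENOENT instead.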
Signed-off-by: Daniel Rosenberg Bug: 37231161 Change-Id: Id2a9708dfa75e1c22f89915c529789caadd2ca4b --- fs/sdcardfs/lookup.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a0f221501b4c..08f62fb3e075 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -368,17 +368,15 @@ put_name: dname.len = name->len; dname.hash = full_name_hash(dname.name, dname.len); lower_dentry = d_lookup(lower_dir_dentry, &dname); - if (lower_dentry) - goto setup_lower; - - lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { - err = -ENOMEM; + /* We called vfs_path_lookup earlier, and did not get a negative + * dentry then. Don't confuse the lower filesystem by forcing one + * on it now... + */ + err = -ENOENT; goto out; } - d_add(lower_dentry, NULL); /* instantiate and hash */ -setup_lower: lower_path.dentry = lower_dentry; lower_path.mnt = mntget(lower_dir_mnt); sdcardfs_set_lower_path(dentry, &lower_path); From d04f4c7320bab619062472469d149387d20095f9 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 18 Apr 2017 22:49:38 -0700 Subject: [PATCH 0767/3220] Android: sdcardfs: Don't complain in fixup_lower_ownership Not all filesystems support changing the owner of a file. We shouldn't complain if it doesn't happen. Signed-off-by: Daniel Rosenberg Bug: 37488099 Change-Id: I403e44ab7230f176e6df82f6adb4e5c82ce57f33 --- fs/sdcardfs/derived_perm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 11cb402d3465..cddfc79ea365 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -252,7 +252,7 @@ retry_deleg: goto retry_deleg; } if (error) - pr_err("sdcardfs: Failed to touch up lower fs gid/uid.\n"); + pr_debug("sdcardfs: Failed to touch up lower fs gid/uid for %s\n", name); } sdcardfs_put_lower_path(dentry, &path); } From 0ed8679d0dbb02598c60cd4bd933f911bd636d01 Mon Sep 17 00:00:00 2001 From: Jiebing Li Date: Tue, 10 Mar 2015 11:24:00 +0800 Subject: [PATCH 0768/3220] ANDROID: usb: gadget: fix MTP enumeration issue under super speed mode MTP function doesn't show as a drive in Windows when the device is connected to PC's USB3 port, because device fails to respond ACK to BULK OUT transfer request. This patch modifies MTP OUT request length as multiple of MaxPacketSize per databook requirement in order to fix this issue. 
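The databook requirement amounts to rounding the OUT request length up to a
multiple of the endpoint's wMaxPacketSize, which is what usb_ep_align_maybe()
does for controllers that set quirk_ep_out_aligned_size. A stand-alone
illustration of the arithmetic (example helper, not part of the driver):

    /* Round len up to a multiple of the endpoint's wMaxPacketSize. */
    static size_t align_to_maxpacket(size_t len, unsigned int maxpacket)
    {
            return ((len + maxpacket - 1) / maxpacket) * maxpacket;
    }

For example, a 1000-byte read on a 512-byte bulk OUT endpoint now queues a
1024-byte request instead of a 1000-byte one.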
Patchset: mtp Change-Id: I090d7880ff00c499dc5ba7fd644b1fe7cd87fcb5 Signed-off-by: Jiebing Li Signed-off-by: Wang, Yu Signed-off-by: Russ Weight --- drivers/usb/gadget/function/f_mtp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index d8b69af6e335..b37cc8d87ca4 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -540,10 +540,12 @@ static ssize_t mtp_read(struct file *fp, char __user *buf, ssize_t r = count; unsigned xfer; int ret = 0; + size_t len; DBG(cdev, "mtp_read(%zu)\n", count); - if (count > MTP_BULK_BUFFER_SIZE) + len = usb_ep_align_maybe(cdev->gadget, dev->ep_out, count); + if (len > MTP_BULK_BUFFER_SIZE) return -EINVAL; /* we will block until we're online */ @@ -567,7 +569,7 @@ static ssize_t mtp_read(struct file *fp, char __user *buf, requeue_req: /* queue a request */ req = dev->rx_req[0]; - req->length = count; + req->length = len; dev->rx_done = 0; ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL); if (ret < 0) { From 53491d941217d00fdddda6b8b755ced36676f079 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 20 Apr 2017 12:13:31 -0700 Subject: [PATCH 0769/3220] Revert "Android: sdcardfs: Don't do d_add for lower fs" This reverts commit e92f72194da2e690d85fc736661d3f0d96825c57. This change caused issues for sdcardfs on top of vfat Signed-off-by: Daniel Rosenberg Change-Id: Ife2ac6d9af40e4ddb95b7261e1dad4d7817e3779 --- fs/sdcardfs/lookup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 08f62fb3e075..a0f221501b4c 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -368,15 +368,17 @@ put_name: dname.len = name->len; dname.hash = full_name_hash(dname.name, dname.len); lower_dentry = d_lookup(lower_dir_dentry, &dname); + if (lower_dentry) + goto setup_lower; + + lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { - /* We called vfs_path_lookup earlier, and did not get a negative - * dentry then. Don't confuse the lower filesystem by forcing one - * on it now... - */ - err = -ENOENT; + err = -ENOMEM; goto out; } + d_add(lower_dentry, NULL); /* instantiate and hash */ +setup_lower: lower_path.dentry = lower_dentry; lower_path.mnt = mntget(lower_dir_mnt); sdcardfs_set_lower_path(dentry, &lower_path); From 738e1a1a468217bd5cb3fdc19582d4a1778bb510 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Apr 2016 10:40:55 +0200 Subject: [PATCH 0770/3220] UPSTREAM: char: Drop bogus dependency of DEVPORT on !M68K (cherry pick from commit 309124e2648d668a0c23539c5078815660a4a850) According to full-history-linux commit d3794f4fa7c3edc3 ("[PATCH] M68k update (part 25)"), port operations are allowed on m68k if CONFIG_ISA is defined. However, commit 153dcc54df826d2f ("[PATCH] mem driver: fix conditional on isa i/o support") accidentally changed an "||" into an "&&", disabling it completely on m68k. This logic was retained when introducing the DEVPORT symbol in commit 4f911d64e04a44c4 ("Make /dev/port conditional on config symbol"). Drop the bogus dependency on !M68K to fix this. 
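Schematically, the conditional regression described above is the difference
between the following two forms (placeholder CONFIG names, not the original
mem.c text):

    /* Intended: either condition is enough to provide port I/O. */
    #if defined(CONFIG_COND_A) || defined(CONFIG_COND_B)
    #define HAVE_PORT_IO 1
    #endif

    /* The accidental "&&": both are required, never true on m68k. */
    #if defined(CONFIG_COND_A) && defined(CONFIG_COND_B)
    #define HAVE_PORT_IO 1
    #endif

The !M68K dependency removed below carried that mistaken result forward into
Kconfig.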
Fixes: 153dcc54df826d2f ("[PATCH] mem driver: fix conditional on isa i/o support")
Signed-off-by: Geert Uytterhoeven
Tested-by: Al Stone
Signed-off-by: Greg Kroah-Hartman
Bug: 37210310
Bug: 36604779
Change-Id: I9139bd8a5a6e9e39c2e428bde23a7d9be07e2f91
---
 drivers/char/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index a043107da2af..b130d38cf55c 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -584,7 +584,6 @@ config TELCLOCK

config DEVPORT
bool
- depends on !M68K
depends on ISA || PCI
default y

From c0fb2f9e5814e4cf8d6e64c00bdacdbf1668ab62 Mon Sep 17 00:00:00 2001
From: Max Bires
Date: Tue, 3 Jan 2017 08:18:07 -0800
Subject: [PATCH 0771/3220] UPSTREAM: char: lack of bool string made
 CONFIG_DEVPORT always on

(cherry pick from commit f2cfa58b136e4b06a9b9db7af5ef62fbb5992f62)

Without a bool string present, using "# CONFIG_DEVPORT is not set" in
defconfig files would not actually unset devport. This ensured that
/dev/port was always on, but there are reasons a user may wish to
disable it (smaller kernel, attack surface reduction) if it's not being
used.

Adding a message here in order to make this option user-visible.

Signed-off-by: Max Bires
Acked-by: Arnd Bergmann
Signed-off-by: Greg Kroah-Hartman
Bug: 37210310
Bug: 36604779
Change-Id: Ib1e947526f6c6f7cdf6389923287631056f32c36
---
 drivers/char/Kconfig | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index b130d38cf55c..3143db57ce44 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -583,9 +583,12 @@ config TELCLOCK
controlling the behavior of this hardware.

config DEVPORT
- bool
+ bool "/dev/port character device"
depends on ISA || PCI
default y
+ help
+ Say Y here if you want to support the /dev/port device. The /dev/port
+ device is similar to /dev/mem, but for I/O ports.

source "drivers/s390/char/Kconfig"

From ebff0104446a71da950709b94cd2d8d2c10c73be Mon Sep 17 00:00:00 2001
From: Jin Qian
Date: Thu, 13 Apr 2017 17:07:58 -0700
Subject: [PATCH 0772/3220] ANDROID: uid_sys_stats: reduce update_io_stats
 overhead

Replaced read_lock with rcu_read_lock to reduce time that preemption is
disabled. Added a function to update io stats for a specific uid and
moved hash table lookup, user_namespace out of loops.
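For background, the task list is RCU-protected, so a reader that only needs a
consistent snapshot can walk all threads under rcu_read_lock() without
blocking writers or holding a spinning lock; the pattern is roughly:

    /* Sketch of the read-side pattern the patch switches to. */
    rcu_read_lock();
    do_each_thread(temp, task) {
            /* read-only inspection of *task is safe here */
    } while_each_thread(temp, task);
    rcu_read_unlock();

read_lock(&tasklist_lock), by contrast, keeps preemption disabled for the
whole scan and delays writers such as fork and exit.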
Bug: 37319300 Change-Id: I2b81b5cd3b6399b40d08c3c14b42cad044556970 Signed-off-by: Jin Qian --- drivers/misc/uid_sys_stats.c | 61 ++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 204b23484266..618617f3e867 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -237,28 +237,28 @@ static void clean_uid_io_last_stats(struct uid_entry *uid_entry, io_last->fsync -= task->ioac.syscfs; } -static void update_io_stats_locked(void) +static void update_io_stats_all_locked(void) { struct uid_entry *uid_entry; struct task_struct *task, *temp; struct io_stats *io_bucket, *io_curr, *io_last; + struct user_namespace *user_ns = current_user_ns(); unsigned long bkt; - - BUG_ON(!rt_mutex_is_locked(&uid_lock)); + uid_t uid; hash_for_each(hash_table, bkt, uid_entry, hash) memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(temp, task) { - uid_entry = find_or_register_uid(from_kuid_munged( - current_user_ns(), task_uid(task))); + uid = from_kuid_munged(user_ns, task_uid(task)); + uid_entry = find_or_register_uid(uid); if (!uid_entry) continue; add_uid_io_curr_stats(uid_entry, task); } while_each_thread(temp, task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); hash_for_each(hash_table, bkt, uid_entry, hash) { io_bucket = &uid_entry->io[uid_entry->state]; @@ -281,6 +281,47 @@ static void update_io_stats_locked(void) } } +static void update_io_stats_uid_locked(uid_t target_uid) +{ + struct uid_entry *uid_entry; + struct task_struct *task, *temp; + struct io_stats *io_bucket, *io_curr, *io_last; + struct user_namespace *user_ns = current_user_ns(); + + uid_entry = find_or_register_uid(target_uid); + if (!uid_entry) + return; + + memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + + rcu_read_lock(); + do_each_thread(temp, task) { + if (from_kuid_munged(user_ns, task_uid(task)) != target_uid) + continue; + add_uid_io_curr_stats(uid_entry, task); + } while_each_thread(temp, task); + rcu_read_unlock(); + + io_bucket = &uid_entry->io[uid_entry->state]; + io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; + io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; + + io_bucket->read_bytes += + io_curr->read_bytes - io_last->read_bytes; + io_bucket->write_bytes += + io_curr->write_bytes - io_last->write_bytes; + io_bucket->rchar += io_curr->rchar - io_last->rchar; + io_bucket->wchar += io_curr->wchar - io_last->wchar; + io_bucket->fsync += io_curr->fsync - io_last->fsync; + + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; +} + static int uid_io_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; @@ -288,7 +329,7 @@ static int uid_io_show(struct seq_file *m, void *v) rt_mutex_lock(&uid_lock); - update_io_stats_locked(); + update_io_stats_all_locked(); hash_for_each(hash_table, bkt, uid_entry, hash) { seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", @@ -363,7 +404,7 @@ static ssize_t uid_procstat_write(struct file *file, return count; } - update_io_stats_locked(); + update_io_stats_uid_locked(uid); uid_entry->state = state; @@ -401,7 +442,7 @@ static int process_notifier(struct notifier_block *self, uid_entry->utime += utime; uid_entry->stime += stime; - update_io_stats_locked(); + 
update_io_stats_uid_locked(uid); clean_uid_io_last_stats(uid_entry, task); exit: From b834e929774513db929303125486c2c34bedba73 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Mon, 24 Apr 2017 15:50:31 +0530 Subject: [PATCH 0773/3220] Revert "USB: gadget: u_ether: Fix data stall issue in RNDIS tethering mode" This reverts commit 78281f6ed79413e5a16eac6edc4a72c6836ae5a9. This data stall fix is no longer required in AOSP. It is already skipped in android-4.9 patchset. Also core change from this data stall fix is already undone by android-4.4 merge commit 324e88de4aba ("Merge tag 'v4.4.32' into android-4.4.y"). This revert patch just clean up the left overs. It also reverts the compile fix from Change-Id: I38c4f4a850b0329fb4a06b2c7e45558e16d66151 40ceb2c69964 ("usb: gadget: Fix compilation problem with tx_qlen field"). Signed-off-by: Amit Pundir --- drivers/usb/gadget/function/u_ether.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index cc5210a87614..21bf0a8423d5 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -66,7 +66,7 @@ struct eth_dev { spinlock_t req_lock; /* guard {rx,tx}_reqs */ struct list_head tx_reqs, rx_reqs; - unsigned tx_qlen; + atomic_t tx_qlen; /* Minimum number of TX USB request queued to UDC */ #define TX_REQ_THRESHOLD 5 int no_tx_req_used; @@ -568,6 +568,7 @@ static void tx_complete(struct usb_ep *ep, struct usb_request *req) dev_kfree_skb_any(skb); } + atomic_dec(&dev->tx_qlen); if (netif_carrier_ok(dev->net)) netif_wake_queue(dev->net); } @@ -746,7 +747,7 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb, req->no_interrupt = (((dev->gadget->speed == USB_SPEED_HIGH || dev->gadget->speed == USB_SPEED_SUPER)) && !list_empty(&dev->tx_reqs)) - ? ((dev->tx_qlen % dev->qmult) != 0) + ? ((atomic_read(&dev->tx_qlen) % dev->qmult) != 0) : 0; retval = usb_ep_queue(in, req, GFP_ATOMIC); @@ -756,6 +757,7 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb, break; case 0: net->trans_start = jiffies; + atomic_inc(&dev->tx_qlen); } if (retval) { @@ -784,7 +786,7 @@ static void eth_start(struct eth_dev *dev, gfp_t gfp_flags) rx_fill(dev, gfp_flags); /* and open the tx floodgates */ - dev->tx_qlen = 0; + atomic_set(&dev->tx_qlen, 0); netif_wake_queue(dev->net); } From d77312aeb21d93c3547f20b8cc09916daa928acc Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 25 Apr 2017 18:30:45 +0000 Subject: [PATCH 0774/3220] Revert "[RFC]cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions" This reverts commit 64f0245f312a849acc83b7c6cdcc8f7c1a14cca3. Needs to be reverted since system_server no longer has CAP_SYS_RESOURCE. 
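Stated positively, the migration rule the hunk below restores is equivalent
to the following (an illustrative paraphrase of the diff, not kernel code):

    /* A writer may move a task into a cgroup if any of these hold. */
    static bool may_migrate(const struct cred *cred, const struct cred *tcred)
    {
            return uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
                   uid_eq(cred->euid, tcred->uid) ||
                   uid_eq(cred->euid, tcred->suid) ||
                   ns_capable(tcred->user_ns, CAP_SYS_NICE);
    }

That is: root, a matching real or saved uid, or CAP_SYS_NICE over the
target's user namespace; the revert only swaps the capability back from
CAP_SYS_RESOURCE.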
Change-Id: Ic500cffb14b80a3e2a5dd41fda0b0b781b5245d6
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 253fdeabf667..8ab133d96435 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2689,7 +2689,7 @@ static int cgroup_procs_write_permission(struct task_struct *task,
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
!uid_eq(cred->euid, tcred->suid) &&
- !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
+ !ns_capable(tcred->user_ns, CAP_SYS_NICE))
ret = -EACCES;

if (!ret && cgroup_on_dfl(dst_cgrp)) {

From 6a6a7657c231e947233c43ae0522bbd4edf0139e Mon Sep 17 00:00:00 2001
From: David Zeuthen
Date: Tue, 24 Jan 2017 13:02:35 -0500
Subject: [PATCH 0775/3220] ANDROID: Update init/do_mounts_dm.c to the latest
 ChromiumOS version.

This is needed for AVB integration work.

Bug: 31796270
Test: Manually tested (other arch).
Change-Id: I32fd37c1578c6414e3e6ff277d16ad94df7886b8
Signed-off-by: David Zeuthen
---
 include/linux/device-mapper.h | 7 +
 init/do_mounts_dm.c | 562 ++++++++++++++++++----------------
 2 files changed, 310 insertions(+), 259 deletions(-)

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index b874d5b61ffc..f3f87db34429 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -415,6 +415,13 @@ union map_info *dm_get_rq_mapinfo(struct request *rq);

struct queue_limits *dm_get_queue_limits(struct mapped_device *md);

+void dm_lock_md_type(struct mapped_device *md);
+void dm_unlock_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, unsigned type);
+unsigned dm_get_md_type(struct mapped_device *md);
+int dm_setup_md_queue(struct mapped_device *md);
+unsigned dm_table_get_type(struct dm_table *t);
+
/*
 * Geometry functions.
 */
diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c
index ecda58df9a19..bce1c2fbb915 100644
--- a/init/do_mounts_dm.c
+++ b/init/do_mounts_dm.c
@@ -5,13 +5,17 @@
 *
 * This file is released under the GPL.
 */
+#include <linux/async.h>
+#include <linux/ctype.h>
 #include <linux/device-mapper.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
+#include <linux/list.h>

 #include "do_mounts.h"
-#include "../drivers/md/dm.h"

+#define DM_MAX_DEVICES 256
+#define DM_MAX_TARGETS 256
 #define DM_MAX_NAME 32
 #define DM_MAX_UUID 129
 #define DM_NO_UUID "none"
@@ -19,14 +23,47 @@
 #define DM_MSG_PREFIX "init"

 /* Separators used for parsing the dm= argument. */
-#define DM_FIELD_SEP ' '
-#define DM_LINE_SEP ','
+#define DM_FIELD_SEP " "
+#define DM_LINE_SEP ","
+#define DM_ANY_SEP DM_FIELD_SEP DM_LINE_SEP

 /*
 * When the device-mapper and any targets are compiled into the kernel
- * (not a module), one target may be created and used as the root device at
- * boot time with the parameters given with the boot line dm=...
- * The code for that is here.
+ * (not a module), one or more device-mappers may be created and used
+ * as the root device at boot time with the parameters given with the
+ * boot line dm=...
+ *
+ * Multiple device-mappers can be stacked by specifying the number of
+ * devices. A device can have multiple targets if the number of
+ * targets is specified.
+ *
+ * TODO(taysom:defect 32847)
+ * In the future, the <num> field will be mandatory.
+ *
+ * <device>        ::= [<num>] <device-mapper>+
+ * <device-mapper> ::= <head> "," <target>+
+ * <head>          ::= <name> <uuid> <mode> [<num>]
+ * <target>        ::= <start> <length> <type> <options> ","
+ * <mode>          ::= "ro" | "rw"
+ * <uuid>          ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none"
+ * <type>          ::= "verity" | "bootcache" | ...
+ * + * Example: + * 2 vboot none ro 1, + * 0 1768000 bootcache + * device=aa55b119-2a47-8c45-946a-5ac57765011f+1 + * signature=76e9be054b15884a9fa85973e9cb274c93afadb6 + * cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000, + * vroot none ro 1, + * 0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1 + * root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6 + * salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe + * + * Notes: + * 1. uuid is a label for the device and we set it to "none". + * 2. The field will be optional initially and assumed to be 1. + * Once all the scripts that set these fields have been set, it will + * be made mandatory. */ struct dm_setup_target { @@ -38,381 +75,388 @@ struct dm_setup_target { struct dm_setup_target *next; }; -static struct { +struct dm_device { int minor; int ro; char name[DM_MAX_NAME]; char uuid[DM_MAX_UUID]; - char *targets; + unsigned long num_targets; struct dm_setup_target *target; int target_count; + struct dm_device *next; +}; + +struct dm_option { + char *start; + char *next; + size_t len; + char delim; +}; + +static struct { + unsigned long num_devices; + char *str; } dm_setup_args __initdata; static __initdata int dm_early_setup; -static size_t __init get_dm_option(char *str, char **next, char sep) +static int __init get_dm_option(struct dm_option *opt, const char *accept) { - size_t len = 0; - char *endp = NULL; + char *str = opt->next; + char *endp; if (!str) return 0; - endp = strchr(str, sep); + str = skip_spaces(str); + opt->start = str; + endp = strpbrk(str, accept); if (!endp) { /* act like strchrnul */ - len = strlen(str); - endp = str + len; + opt->len = strlen(str); + endp = str + opt->len; } else { - len = endp - str; + opt->len = endp - str; } - - if (endp == str) - return 0; - - if (!next) - return len; - + opt->delim = *endp; if (*endp == 0) { /* Don't advance past the nul. 
*/ - *next = endp; + opt->next = endp; } else { - *next = endp + 1; + opt->next = endp + 1; } - return len; + return opt->len != 0; } -static int __init dm_setup_args_init(void) +static int __init dm_setup_cleanup(struct dm_device *devices) { - dm_setup_args.minor = 0; - dm_setup_args.ro = 0; - dm_setup_args.target = NULL; - dm_setup_args.target_count = 0; + struct dm_device *dev = devices; + + while (dev) { + struct dm_device *old_dev = dev; + struct dm_setup_target *target = dev->target; + while (target) { + struct dm_setup_target *old_target = target; + kfree(target->type); + kfree(target->params); + target = target->next; + kfree(old_target); + dev->target_count--; + } + BUG_ON(dev->target_count); + dev = dev->next; + kfree(old_dev); + } return 0; } -static int __init dm_setup_cleanup(void) +static char * __init dm_parse_device(struct dm_device *dev, char *str) { - struct dm_setup_target *target = dm_setup_args.target; - struct dm_setup_target *old_target = NULL; - while (target) { - kfree(target->type); - kfree(target->params); - old_target = target; - target = target->next; - kfree(old_target); - dm_setup_args.target_count--; - } - BUG_ON(dm_setup_args.target_count); - return 0; -} - -static char * __init dm_setup_parse_device_args(char *str) -{ - char *next = NULL; - size_t len = 0; + struct dm_option opt; + size_t len; /* Grab the logical name of the device to be exported to udev */ - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len) { + opt.next = str; + if (!get_dm_option(&opt, DM_FIELD_SEP)) { DMERR("failed to parse device name"); goto parse_fail; } - len = min(len + 1, sizeof(dm_setup_args.name)); - strlcpy(dm_setup_args.name, str, len); /* includes nul */ - str = skip_spaces(next); + len = min(opt.len + 1, sizeof(dev->name)); + strlcpy(dev->name, opt.start, len); /* includes nul */ /* Grab the UUID value or "none" */ - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len) { + if (!get_dm_option(&opt, DM_FIELD_SEP)) { DMERR("failed to parse device uuid"); goto parse_fail; } - len = min(len + 1, sizeof(dm_setup_args.uuid)); - strlcpy(dm_setup_args.uuid, str, len); - str = skip_spaces(next); + len = min(opt.len + 1, sizeof(dev->uuid)); + strlcpy(dev->uuid, opt.start, len); /* Determine if the table/device will be read only or read-write */ - if (!strncmp("ro,", str, 3)) { - dm_setup_args.ro = 1; - } else if (!strncmp("rw,", str, 3)) { - dm_setup_args.ro = 0; + get_dm_option(&opt, DM_ANY_SEP); + if (!strncmp("ro", opt.start, opt.len)) { + dev->ro = 1; + } else if (!strncmp("rw", opt.start, opt.len)) { + dev->ro = 0; } else { DMERR("failed to parse table mode"); goto parse_fail; } - str = skip_spaces(str + 3); - return str; + /* Optional number field */ + /* XXX: The field will be mandatory in the next round */ + if (opt.delim == DM_FIELD_SEP[0]) { + if (!get_dm_option(&opt, DM_LINE_SEP)) + return NULL; + dev->num_targets = simple_strtoul(opt.start, NULL, 10); + } else { + dev->num_targets = 1; + } + if (dev->num_targets > DM_MAX_TARGETS) { + DMERR("too many targets %lu > %d", + dev->num_targets, DM_MAX_TARGETS); + } + return opt.next; parse_fail: return NULL; } -static void __init dm_substitute_devices(char *str, size_t str_len) +static char * __init dm_parse_targets(struct dm_device *dev, char *str) { - char *candidate = str; - char *candidate_end = str; - char old_char; - size_t len = 0; - dev_t dev; - - if (str_len < 3) - return; - - while (str && *str) { - candidate = strchr(str, '/'); - if (!candidate) - break; - - /* Avoid embedded slashes */ - if (candidate != 
str && *(candidate - 1) != DM_FIELD_SEP) { - str = strchr(candidate, DM_FIELD_SEP); - continue; - } - - len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP); - str = skip_spaces(candidate_end); - if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */ - continue; - - /* Temporarily terminate with a nul */ - if (*candidate_end) - candidate_end--; - old_char = *candidate_end; - *candidate_end = '\0'; - - DMDEBUG("converting candidate device '%s' to dev_t", candidate); - /* Use the boot-time specific device naming */ - dev = name_to_dev_t(candidate); - *candidate_end = old_char; - - DMDEBUG(" -> %u", dev); - /* No suitable replacement found */ - if (!dev) - continue; - - /* Rewrite the /dev/path as a major:minor */ - len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev)); - if (!len) { - DMERR("error substituting device major/minor."); - break; - } - candidate += len; - /* Pad out with spaces (fixing our nul) */ - while (candidate < candidate_end) - *(candidate++) = DM_FIELD_SEP; - } -} - -static int __init dm_setup_parse_targets(char *str) -{ - char *next = NULL; - size_t len = 0; - struct dm_setup_target **target = NULL; + struct dm_option opt; + struct dm_setup_target **target = &dev->target; + unsigned long num_targets = dev->num_targets; + unsigned long i; /* Targets are defined as per the table format but with a * comma as a newline separator. */ - target = &dm_setup_args.target; - while (str && *str) { + opt.next = str; + for (i = 0; i < num_targets; i++) { *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL); if (!*target) { - DMERR("failed to allocate memory for target %d", - dm_setup_args.target_count); + DMERR("failed to allocate memory for target %s<%ld>", + dev->name, i); goto parse_fail; } - dm_setup_args.target_count++; + dev->target_count++; - (*target)->begin = simple_strtoull(str, &next, 10); - if (!next || *next != DM_FIELD_SEP) { - DMERR("failed to parse starting sector for target %d", - dm_setup_args.target_count - 1); + if (!get_dm_option(&opt, DM_FIELD_SEP)) { + DMERR("failed to parse starting sector" + " for target %s<%ld>", dev->name, i); goto parse_fail; } - str = skip_spaces(next + 1); + (*target)->begin = simple_strtoull(opt.start, NULL, 10); - (*target)->length = simple_strtoull(str, &next, 10); - if (!next || *next != DM_FIELD_SEP) { - DMERR("failed to parse length for target %d", - dm_setup_args.target_count - 1); + if (!get_dm_option(&opt, DM_FIELD_SEP)) { + DMERR("failed to parse length for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next + 1); + (*target)->length = simple_strtoull(opt.start, NULL, 10); - len = get_dm_option(str, &next, DM_FIELD_SEP); - if (!len || - !((*target)->type = kstrndup(str, len, GFP_KERNEL))) { - DMERR("failed to parse type for target %d", - dm_setup_args.target_count - 1); + if (get_dm_option(&opt, DM_FIELD_SEP)) + (*target)->type = kstrndup(opt.start, opt.len, + GFP_KERNEL); + if (!((*target)->type)) { + DMERR("failed to parse type for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next); - - len = get_dm_option(str, &next, DM_LINE_SEP); - if (!len || - !((*target)->params = kstrndup(str, len, GFP_KERNEL))) { - DMERR("failed to parse params for target %d", - dm_setup_args.target_count - 1); + if (get_dm_option(&opt, DM_LINE_SEP)) + (*target)->params = kstrndup(opt.start, opt.len, + GFP_KERNEL); + if (!((*target)->params)) { + DMERR("failed to parse params for target %s<%ld>", + dev->name, i); goto parse_fail; } - str = skip_spaces(next); - - /* 
Before moving on, walk through the copied target and - * attempt to replace all /dev/xxx with the major:minor number. - * It may not be possible to resolve them traditionally at - * boot-time. */ - dm_substitute_devices((*target)->params, len); - target = &((*target)->next); } - DMDEBUG("parsed %d targets", dm_setup_args.target_count); + DMDEBUG("parsed %d targets", dev->target_count); - return 0; + return opt.next; parse_fail: - return 1; + return NULL; +} + +static struct dm_device * __init dm_parse_args(void) +{ + struct dm_device *devices = NULL; + struct dm_device **tail = &devices; + struct dm_device *dev; + char *str = dm_setup_args.str; + unsigned long num_devices = dm_setup_args.num_devices; + unsigned long i; + + if (!str) + return NULL; + for (i = 0; i < num_devices; i++) { + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + DMERR("failed to allocated memory for dev"); + goto error; + } + *tail = dev; + tail = &dev->next; + /* + * devices are given minor numbers 0 - n-1 + * in the order they are found in the arg + * string. + */ + dev->minor = i; + str = dm_parse_device(dev, str); + if (!str) /* NULL indicates error in parsing, bail */ + goto error; + + str = dm_parse_targets(dev, str); + if (!str) + goto error; + } + return devices; +error: + dm_setup_cleanup(devices); + return NULL; } /* * Parse the command-line parameters given our kernel, but do not * actually try to invoke the DM device now; that is handled by - * dm_setup_drive after the low-level disk drivers have initialised. - * dm format is as follows: - * dm="name uuid fmode,[table line 1],[table line 2],..." - * May be used with root=/dev/dm-0 as it always uses the first dm minor. + * dm_setup_drives after the low-level disk drivers have initialised. + * dm format is described at the top of the file. + * + * Because dm minor numbers are assigned in assending order starting with 0, + * You can assume the first device is /dev/dm-0, the next device is /dev/dm-1, + * and so forth. */ - static int __init dm_setup(char *str) { - dm_setup_args_init(); + struct dm_option opt; + unsigned long num_devices; - str = dm_setup_parse_device_args(str); if (!str) { DMDEBUG("str is NULL"); goto parse_fail; } - - /* Target parsing is delayed until we have dynamic memory */ - dm_setup_args.targets = str; - - printk(KERN_INFO "dm: will configure '%s' on dm-%d\n", - dm_setup_args.name, dm_setup_args.minor); - + opt.next = str; + if (!get_dm_option(&opt, DM_FIELD_SEP)) + goto parse_fail; + if (isdigit(opt.start[0])) { /* XXX: Optional number field */ + num_devices = simple_strtoul(opt.start, NULL, 10); + str = opt.next; + } else { + num_devices = 1; + /* Don't advance str */ + } + if (num_devices > DM_MAX_DEVICES) { + DMDEBUG("too many devices %lu > %d", + num_devices, DM_MAX_DEVICES); + } + dm_setup_args.str = str; + dm_setup_args.num_devices = num_devices; + DMINFO("will configure %lu devices", num_devices); dm_early_setup = 1; return 1; parse_fail: - printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n"); + DMWARN("Invalid arguments supplied to dm=."); return 0; } - -static void __init dm_setup_drive(void) +static void __init dm_setup_drives(void) { struct mapped_device *md = NULL; struct dm_table *table = NULL; struct dm_setup_target *target; - char *uuid = dm_setup_args.uuid; + struct dm_device *dev; + char *uuid; fmode_t fmode = FMODE_READ; + struct dm_device *devices; - /* Finish parsing the targets. 
*/ - if (dm_setup_parse_targets(dm_setup_args.targets)) - goto parse_fail; + devices = dm_parse_args(); - if (dm_create(dm_setup_args.minor, &md)) { - DMDEBUG("failed to create the device"); - goto dm_create_fail; - } - DMDEBUG("created device '%s'", dm_device_name(md)); - - /* In addition to flagging the table below, the disk must be - * set explicitly ro/rw. */ - set_disk_ro(dm_disk(md), dm_setup_args.ro); - - if (!dm_setup_args.ro) - fmode |= FMODE_WRITE; - if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) { - DMDEBUG("failed to create the table"); - goto dm_table_create_fail; - } - - dm_lock_md_type(md); - target = dm_setup_args.target; - while (target) { - DMINFO("adding target '%llu %llu %s %s'", - (unsigned long long) target->begin, - (unsigned long long) target->length, target->type, - target->params); - if (dm_table_add_target(table, target->type, target->begin, - target->length, target->params)) { - DMDEBUG("failed to add the target to the table"); - goto add_target_fail; + for (dev = devices; dev; dev = dev->next) { + if (dm_create(dev->minor, &md)) { + DMDEBUG("failed to create the device"); + goto dm_create_fail; } - target = target->next; - } + DMDEBUG("created device '%s'", dm_device_name(md)); - if (dm_table_complete(table)) { - DMDEBUG("failed to complete the table"); - goto table_complete_fail; - } + /* + * In addition to flagging the table below, the disk must be + * set explicitly ro/rw. + */ + set_disk_ro(dm_disk(md), dev->ro); - if (dm_get_md_type(md) == DM_TYPE_NONE) { + if (!dev->ro) + fmode |= FMODE_WRITE; + if (dm_table_create(&table, fmode, dev->target_count, md)) { + DMDEBUG("failed to create the table"); + goto dm_table_create_fail; + } + + dm_lock_md_type(md); + + for (target = dev->target; target; target = target->next) { + DMINFO("adding target '%llu %llu %s %s'", + (unsigned long long) target->begin, + (unsigned long long) target->length, + target->type, target->params); + if (dm_table_add_target(table, target->type, + target->begin, + target->length, + target->params)) { + DMDEBUG("failed to add the target" + " to the table"); + goto add_target_fail; + } + } + if (dm_table_complete(table)) { + DMDEBUG("failed to complete the table"); + goto table_complete_fail; + } + + /* Suspend the device so that we can bind it to the table. */ + if (dm_suspend(md, 0)) { + DMDEBUG("failed to suspend the device pre-bind"); + goto suspend_fail; + } + + /* Initial table load: acquire type of table. */ dm_set_md_type(md, dm_table_get_type(table)); + + /* Setup md->queue to reflect md's type. */ if (dm_setup_md_queue(md)) { DMWARN("unable to set up device queue for new table."); goto setup_md_queue_fail; } - } else if (dm_get_md_type(md) != dm_table_get_type(table)) { - DMWARN("can't change device type after initial table load."); - goto setup_md_queue_fail; - } - /* Suspend the device so that we can bind it to the table. */ - if (dm_suspend(md, 0)) { - DMDEBUG("failed to suspend the device pre-bind"); - goto suspend_fail; + /* + * Bind the table to the device. This is the only way + * to associate md->map with the table and set the disk + * capacity directly. + */ + if (dm_swap_table(md, table)) { /* should return NULL. */ + DMDEBUG("failed to bind the device to the table"); + goto table_bind_fail; + } + + /* Finally, resume and the device should be ready. 
*/ + if (dm_resume(md)) { + DMDEBUG("failed to resume the device"); + goto resume_fail; + } + + /* Export the dm device via the ioctl interface */ + if (!strcmp(DM_NO_UUID, dev->uuid)) + uuid = NULL; + if (dm_ioctl_export(md, dev->name, uuid)) { + DMDEBUG("failed to export device with given" + " name and uuid"); + goto export_fail; + } + + dm_unlock_md_type(md); + + DMINFO("dm-%d is ready", dev->minor); } - - /* Bind the table to the device. This is the only way to associate - * md->map with the table and set the disk capacity directly. */ - if (dm_swap_table(md, table)) { /* should return NULL. */ - DMDEBUG("failed to bind the device to the table"); - goto table_bind_fail; - } - - /* Finally, resume and the device should be ready. */ - if (dm_resume(md)) { - DMDEBUG("failed to resume the device"); - goto resume_fail; - } - - /* Export the dm device via the ioctl interface */ - if (!strcmp(DM_NO_UUID, dm_setup_args.uuid)) - uuid = NULL; - if (dm_ioctl_export(md, dm_setup_args.name, uuid)) { - DMDEBUG("failed to export device with given name and uuid"); - goto export_fail; - } - printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor); - - dm_unlock_md_type(md); - dm_setup_cleanup(); + dm_setup_cleanup(devices); return; export_fail: resume_fail: table_bind_fail: -suspend_fail: setup_md_queue_fail: +suspend_fail: table_complete_fail: add_target_fail: dm_unlock_md_type(md); dm_table_create_fail: dm_put(md); dm_create_fail: - dm_setup_cleanup(); -parse_fail: - printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n", - dm_setup_args.minor, dm_setup_args.name); + DMWARN("starting dm-%d (%s) failed", + dev->minor, dev->name); + dm_setup_cleanup(devices); } __setup("dm=", dm_setup); @@ -421,6 +465,6 @@ void __init dm_run_setup(void) { if (!dm_early_setup) return; - printk(KERN_INFO "dm: attempting early device configuration.\n"); - dm_setup_drive(); + DMINFO("attempting early device configuration."); + dm_setup_drives(); } From 8d6f006d608c3b03652fb919e496945f2d4d4f1d Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Tue, 24 Jan 2017 13:17:01 -0500 Subject: [PATCH 0776/3220] ANDROID: AVB error handler to invalidate vbmeta partition. If androidboot.vbmeta.device is set and points to a device with vbmeta magic, this header will be overwritten upon an irrecoverable dm-verity error. The side-effect of this is that the slot will fail to verify on next reboot, effectively triggering the boot loader to fallback to another slot. This work both if the vbmeta struct is at the start of a partition or if there's an AVB footer at the end. This code is based on drivers/md/dm-verity-chromeos.c from ChromiumOS. Example: [ 0.000000] Kernel command line: rootfstype=ext4 init=/init console=ttyS0,115200 androidboot.console=ttyS0 androidboot.hardware=uefi_x86_64 enforcing=0 androidboot.selinux=permissive androidboot.debuggable=1 buildvariant=eng dm="1 vroot none ro 1,0 2080496 verity 1 PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b 4096 4096 260062 260062 sha1 4f76354c86e430e27426d584a726f2fbffecae32 7e4085342d634065269631ac9a199e1a43f4632c 1 ignore_zero_blocks" root=0xfd00 androidboot.vbmeta.device=PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 androidboot.slot_suffix=_a androidboot.vbmeta.device_state=unlocked androidboot.vbmeta.hash_alg=sha256 androidboot.vbmeta.size=3200 androidboot.vbmeta.digest=14fe41c2b3696c31b7ad5eae7877d7d188995e1ab122c604aaaf4785850b91f7 skip_initramfs [...] 
[ 0.612802] device-mapper: verity-avb: AVB error handler initialized with vbmeta device: PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 [...] [ 1.213804] device-mapper: init: attempting early device configuration. [ 1.214752] device-mapper: init: adding target '0 2080496 verity 1 PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b PARTUUID=6779df46-78f6-4c69-bf53-59bb1fbf126b 4096 4096 260062 260062 sha1 4f76354c86e430e27426d584a726f2fbffecae32 7e4085342d634065269631ac9a199e1a43f4632c 1 ignore_zero_blocks' [ 1.217643] device-mapper: init: dm-0 is ready [ 1.226694] device-mapper: verity: 8:6: data block 0 is corrupted [ 1.227666] device-mapper: verity-avb: AVB error handler called for PARTUUID=b865935d-38fb-4c4e-b8b4-70dc67321552 [ 1.234308] device-mapper: verity-avb: invalidate_vbmeta: found vbmeta partition [ 1.235848] device-mapper: verity-avb: invalidate_vbmeta: completed. [...] Bug: 31622239 Test: Manually tested (other arch). Change-Id: Idf6be32d6a3d28e15de9302aa26ad6a516d663aa Signed-off-by: David Zeuthen --- drivers/md/Kconfig | 11 ++ drivers/md/Makefile | 4 + drivers/md/dm-verity-avb.c | 217 ++++++++++++++++++++++++++++++++++ drivers/md/dm-verity-target.c | 6 +- drivers/md/dm-verity.h | 1 + 5 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 drivers/md/dm-verity-avb.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 9eb08a43cd27..6f2fde5d98e7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -515,6 +515,17 @@ config DM_LOG_WRITES If unsure, say N. +config DM_VERITY_AVB + tristate "Support AVB specific verity error behavior" + depends on DM_VERITY + ---help--- + Enables Android Verified Boot platform-specific error + behavior. In particular, it will modify the vbmeta partition + specified on the kernel command-line when non-transient error + occurs (followed by a panic). + + If unsure, say N. + config DM_ANDROID_VERITY bool "Android verity target support" depends on DM_VERITY=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 32b5d0a90d60..c22cc74c9fa8 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -69,3 +69,7 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif + +ifeq ($(CONFIG_DM_VERITY_AVB),y) +dm-verity-objs += dm-verity-avb.o +endif diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c new file mode 100644 index 000000000000..88487346c4c6 --- /dev/null +++ b/drivers/md/dm-verity-avb.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) 2017 Google. + * + * This file is released under the GPLv2. + * + * Based on drivers/md/dm-verity-chromeos.c + */ + +#include +#include +#include + +#define DM_MSG_PREFIX "verity-avb" + +/* Set via module parameter. 
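+ * The value arrives on the kernel command line as
+ * androidboot.vbmeta.device (see the MODULE_PARAM_PREFIX override
+ * at the bottom of this file).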
*/ +static char avb_vbmeta_device[64]; + +static void invalidate_vbmeta_endio(struct bio *bio) +{ + complete(bio->bi_private); +} + +static int invalidate_vbmeta_submit(struct bio *bio, + struct block_device *bdev, + int rw, int access_last_sector, + struct page *page) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + bio->bi_private = &wait; + bio->bi_end_io = invalidate_vbmeta_endio; + bio->bi_bdev = bdev; + + bio->bi_iter.bi_sector = 0; + if (access_last_sector) { + sector_t last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; + bio->bi_iter.bi_sector = last_sector; + } + bio->bi_vcnt = 1; + bio->bi_iter.bi_idx = 0; + bio->bi_iter.bi_size = 512; + bio->bi_iter.bi_bvec_done = 0; + bio->bi_rw = rw; + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = 512; + bio->bi_io_vec[0].bv_offset = 0; + + submit_bio(rw, bio); + /* Wait up to 2 seconds for completion or fail. */ + if (!wait_for_completion_timeout(&wait, msecs_to_jiffies(2000))) + return -EIO; + return 0; +} + +static int invalidate_vbmeta(dev_t vbmeta_devt) +{ + int ret = 0; + struct block_device *bdev; + struct bio *bio; + struct page *page; + fmode_t dev_mode; + /* Ensure we do synchronous unblocked I/O. We may also need + * sync_bdev() on completion, but it really shouldn't. + */ + int rw = REQ_SYNC | REQ_SOFTBARRIER | REQ_NOIDLE; + int access_last_sector = 0; + + /* First we open the device for reading. */ + dev_mode = FMODE_READ | FMODE_EXCL; + bdev = blkdev_get_by_dev(vbmeta_devt, dev_mode, + invalidate_vbmeta); + if (IS_ERR(bdev)) { + DMERR("invalidate_kernel: could not open device for reading"); + dev_mode = 0; + ret = -ENOENT; + goto failed_to_read; + } + + bio = bio_alloc(GFP_NOIO, 1); + if (!bio) { + ret = -ENOMEM; + goto failed_bio_alloc; + } + + page = alloc_page(GFP_NOIO); + if (!page) { + ret = -ENOMEM; + goto failed_to_alloc_page; + } + + access_last_sector = 0; + ret = invalidate_vbmeta_submit(bio, bdev, rw, access_last_sector, page); + if (ret) { + DMERR("invalidate_vbmeta: error reading"); + goto failed_to_submit_read; + } + + /* We have a page. Let's make sure it looks right. */ + if (memcmp("AVB0", page_address(page), 4) == 0) { + /* Stamp it. */ + memcpy(page_address(page), "AVE0", 4); + DMINFO("invalidate_vbmeta: found vbmeta partition"); + } else { + /* Could be this is on a AVB footer, check. Also, since the + * AVB footer is in the last 64 bytes, adjust for the fact that + * we're dealing with 512-byte sectors. + */ + size_t offset = (1<bi_remaining. + */ + bio_reset(bio); + + rw |= REQ_WRITE; + ret = invalidate_vbmeta_submit(bio, bdev, rw, access_last_sector, page); + if (ret) { + DMERR("invalidate_vbmeta: error writing"); + goto failed_to_submit_write; + } + + DMERR("invalidate_vbmeta: completed."); + ret = 0; +failed_to_submit_write: +failed_to_write: +invalid_header: + __free_page(page); +failed_to_submit_read: + /* Technically, we'll leak a page with the pending bio, but + * we're about to reboot anyway. 
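+ * (the dm-verity RESTART error path calls kernel_restart() right
+ * after dm_verity_avb_error_handler() returns)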
+ */ +failed_to_alloc_page: + bio_put(bio); +failed_bio_alloc: + if (dev_mode) + blkdev_put(bdev, dev_mode); +failed_to_read: + return ret; +} + +void dm_verity_avb_error_handler(void) +{ + dev_t dev; + + DMINFO("AVB error handler called for %s", avb_vbmeta_device); + + if (avb_vbmeta_device[0] == '\0') { + DMERR("avb_vbmeta_device parameter not set"); + goto fail_no_dev; + } + + dev = name_to_dev_t(avb_vbmeta_device); + if (!dev) { + DMERR("No matching partition for device: %s", + avb_vbmeta_device); + goto fail_no_dev; + } + + invalidate_vbmeta(dev); + +fail_no_dev: + ; +} + +static int __init dm_verity_avb_init(void) +{ + DMINFO("AVB error handler initialized with vbmeta device: %s", + avb_vbmeta_device); + return 0; +} + +static void __exit dm_verity_avb_exit(void) +{ +} + +module_init(dm_verity_avb_init); +module_exit(dm_verity_avb_exit); + +MODULE_AUTHOR("David Zeuthen "); +MODULE_DESCRIPTION("AVB-specific error handler for dm-verity"); +MODULE_LICENSE("GPL"); + +/* Declare parameter with no module prefix */ +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "androidboot.vbmeta." +module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index c7e97cf6e7fb..e34cf53bd068 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -233,8 +233,12 @@ out: if (v->mode == DM_VERITY_MODE_LOGGING) return 0; - if (v->mode == DM_VERITY_MODE_RESTART) + if (v->mode == DM_VERITY_MODE_RESTART) { +#ifdef CONFIG_DM_VERITY_AVB + dm_verity_avb_error_handler(); +#endif kernel_restart("dm-verity device corrupted"); + } return 1; } diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 75effca400a3..a90d1d416107 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -136,4 +136,5 @@ extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits); extern void verity_dtr(struct dm_target *ti); extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv); extern int verity_map(struct dm_target *ti, struct bio *bio); +extern void dm_verity_avb_error_handler(void); #endif /* DM_VERITY_H */ From 0f3b6e26eb2ebb0c66f20ed53134e26496c62dd7 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 20 Apr 2017 18:05:02 -0700 Subject: [PATCH 0777/3220] ANDROID: sdcardfs: Use filesystem specific hash We weren't accounting for FS specific hash functions, causing us to miss negative dentries for any FS that had one. 
Similar to a patch from esdfs commit 75bd25a9476d ("esdfs: support lower's own hash") Signed-off-by: Daniel Rosenberg Change-Id: I32d1ba304d728e0ca2648cacfb4c2e441ae63608 --- fs/sdcardfs/lookup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index a0f221501b4c..446ef4027ebc 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -366,8 +366,13 @@ put_name: /* instatiate a new negative dentry */ dname.name = name->name; dname.len = name->len; - dname.hash = full_name_hash(dname.name, dname.len); - lower_dentry = d_lookup(lower_dir_dentry, &dname); + + /* See if the low-level filesystem might want + * to use its own hash + */ + lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname); + if (IS_ERR(lower_dentry)) + return lower_dentry; if (lower_dentry) goto setup_lower; From b4840d3bba899cd1f2434fb9f1277b36673a4b3e Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 20 Apr 2017 18:21:50 -0700 Subject: [PATCH 0778/3220] Revert "Revert "Android: sdcardfs: Don't do d_add for lower fs"" This reverts commit ffa75fdb9c408f49b9622b6d55752ed99ff61488. Turns out we just needed the right hash. Signed-off-by: Daniel Rosenberg Bug: 37231161 Change-Id: I6a6de7f7df99ad42b20fa062913b219f64020c31 --- fs/sdcardfs/lookup.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 446ef4027ebc..509d5fbcb472 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -373,17 +373,15 @@ put_name: lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname); if (IS_ERR(lower_dentry)) return lower_dentry; - if (lower_dentry) - goto setup_lower; - - lower_dentry = d_alloc(lower_dir_dentry, &dname); if (!lower_dentry) { - err = -ENOMEM; + /* We called vfs_path_lookup earlier, and did not get a negative + * dentry then. Don't confuse the lower filesystem by forcing + * one on it now... + */ + err = -ENOENT; goto out; } - d_add(lower_dentry, NULL); /* instantiate and hash */ -setup_lower: lower_path.dentry = lower_dentry; lower_path.mnt = mntget(lower_dir_mnt); sdcardfs_set_lower_path(dentry, &lower_path); From 46d925efcc72a2b52d404ad8a457a5bdc7c18b83 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 16:10:21 -0700 Subject: [PATCH 0779/3220] ANDROID: sdcardfs: Copy meta-data from lower inode From wrapfs commit 3ee9b365e38c ("Wrapfs: properly copy meta-data after AIO operations from lower inode") Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I9a789222e27a17b8d85ce61c45397d1839f9a675 --- fs/sdcardfs/file.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 1f6921e2ffbf..6076c342dae6 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -358,9 +358,12 @@ ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter) get_file(lower_file); /* prevent lower_file from being released */ iocb->ki_filp = lower_file; err = lower_file->f_op->read_iter(iocb, iter); - /* ? wait IO finish to update atime as ecryptfs ? 
*/ iocb->ki_filp = file; fput(lower_file); + /* update upper inode atime as needed */ + if (err >= 0 || err == -EIOCBQUEUED) + fsstack_copy_attr_atime(file->f_path.dentry->d_inode, + file_inode(lower_file)); out: return err; } @@ -384,6 +387,13 @@ ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter) err = lower_file->f_op->write_iter(iocb, iter); iocb->ki_filp = file; fput(lower_file); + /* update upper inode times/sizes as needed */ + if (err >= 0 || err == -EIOCBQUEUED) { + fsstack_copy_inode_size(file->f_path.dentry->d_inode, + file_inode(lower_file)); + fsstack_copy_attr_times(file->f_path.dentry->d_inode, + file_inode(lower_file)); + } out: return err; } From 33fddbee41d511bd1f1946f3dd66351d6895c30f Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 16:11:03 -0700 Subject: [PATCH 0780/3220] ANDROID: sdcardfs: Avoid setting GIDs outside of valid ranges When setting up the ownership of files on the lower filesystem, ensure that these values are in reasonable ranges for apps. If they aren't, default to AID_MEDIA_RW Signed-off-by: Daniel Rosenberg Bug: 37516160 Change-Id: I0bec76a61ac72aff0b993ab1ad04be8382178a00 --- fs/sdcardfs/derived_perm.c | 8 ++++---- fs/sdcardfs/multiuser.h | 7 +++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index cddfc79ea365..b4595aab5713 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -215,16 +215,16 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) gid = AID_MEDIA_OBB; break; case PERM_ANDROID_PACKAGE: - if (info->d_uid != 0) + if (uid_is_app(info->d_uid)) gid = multiuser_get_ext_gid(info->d_uid); else - gid = multiuser_get_uid(info->userid, uid); + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); break; case PERM_ANDROID_PACKAGE_CACHE: - if (info->d_uid != 0) + if (uid_is_app(info->d_uid)) gid = multiuser_get_ext_cache_gid(info->d_uid); else - gid = multiuser_get_uid(info->userid, uid); + gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); break; case PERM_PRE_ROOT: default: diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h index d0c925cda299..85341e753f8c 100644 --- a/fs/sdcardfs/multiuser.h +++ b/fs/sdcardfs/multiuser.h @@ -35,6 +35,13 @@ static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id) return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET); } +static inline bool uid_is_app(uid_t uid) +{ + appid_t appid = uid % AID_USER_OFFSET; + + return appid >= AID_APP_START && appid <= AID_APP_END; +} + static inline gid_t multiuser_get_ext_cache_gid(uid_t uid) { return uid - AID_APP_START + AID_EXT_CACHE_GID_START; From b878b260109386898ece2fc99ff2da255acd9c10 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 24 Apr 2017 19:49:02 -0700 Subject: [PATCH 0781/3220] ANDROID: sdcardfs: Call lower fs's revalidate We should be calling the lower filesystem's revalidate inside of sdcardfs's revalidate, as wrapfs does. 
Signed-off-by: Daniel Rosenberg Bug: 35766959 Change-Id: I939d1c4192fafc1e21678aeab43fe3d588b8e2f4 --- fs/sdcardfs/dentry.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 8e31d1a80f0c..7a19e77fce99 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -60,6 +60,14 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) lower_dentry = lower_path.dentry; lower_cur_parent_dentry = dget_parent(lower_dentry); + if ((lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) { + err = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (err == 0) { + d_drop(dentry); + goto out; + } + } + spin_lock(&lower_dentry->d_lock); if (d_unhashed(lower_dentry)) { spin_unlock(&lower_dentry->d_lock); From 3f0531e5775303091a1ff975cdd572cc6a935321 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 24 Apr 2017 18:20:52 -0700 Subject: [PATCH 0782/3220] BACKPORT: f2fs: sanity check log_blocks_per_seg f2fs currently only supports 4KB block size and 2MB segment size. Sanity check log_blocks_per_seg == 9, i.e. 2MB/4KB = (1 << 9) Partially (cherry-picked from commit 9a59b62fd88196844cee5fff851bee2cfd7afb6e) f2fs: do more integrity verification for superblock Do more sanity check for superblock during ->mount. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Bug: 36817013 Change-Id: I0be52e54fba82083068337ceb9f7ad985a87319f Signed-off-by: Jin Qian --- fs/f2fs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132352..98a77b0a365d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -947,6 +947,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || From 90d78776c4a0e13fb7ee5bd0787f04a1730631a6 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Tue, 25 Apr 2017 18:07:43 +0800 Subject: [PATCH 0783/3220] ANDROID: uid_sys_stats: fix access of task_uid(task) struct task_struct *task should be protected by tasklist_lock.
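As a sketch of the locking rule this change relies on (illustrative only; the helper name and the pr_info() payload are invented for the example, not part of this patch):

#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/uidgid.h>

static void walk_tasks_example(struct user_namespace *user_ns)
{
	struct task_struct *group, *task;

	read_lock(&tasklist_lock);
	do_each_thread(group, task) {
		/*
		 * task cannot be reaped while tasklist_lock is
		 * read-held, so task_uid(task) is safe here.
		 */
		pr_info("uid=%u\n",
			from_kuid_munged(user_ns, task_uid(task)));
	} while_each_thread(group, task);
	read_unlock(&tasklist_lock);
}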
Change-Id: Iefcd13442a9b9d855a2bbcde9fd838a4132fee58 Signed-off-by: Ganesh Mahendran --- drivers/misc/uid_sys_stats.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 618617f3e867..ad21276c8d9e 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -95,9 +95,11 @@ static int uid_cputime_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; struct task_struct *task, *temp; + struct user_namespace *user_ns = current_user_ns(); cputime_t utime; cputime_t stime; unsigned long bkt; + uid_t uid; rt_mutex_lock(&uid_lock); @@ -108,14 +110,13 @@ static int uid_cputime_show(struct seq_file *m, void *v) read_lock(&tasklist_lock); do_each_thread(temp, task) { - uid_entry = find_or_register_uid(from_kuid_munged( - current_user_ns(), task_uid(task))); + uid = from_kuid_munged(user_ns, task_uid(task)); + uid_entry = find_or_register_uid(uid); if (!uid_entry) { read_unlock(&tasklist_lock); rt_mutex_unlock(&uid_lock); pr_err("%s: failed to find the uid_entry for uid %d\n", - __func__, from_kuid_munged(current_user_ns(), - task_uid(task))); + __func__, uid); return -ENOMEM; } task_cputime_adjusted(task, &utime, &stime); From 1f48a715af37ceffd86ee7a5f693dd2ca2dda892 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Fri, 28 Apr 2017 12:53:04 -0600 Subject: [PATCH 0784/3220] net: pppolac/pppopns: Add back the msg_flags Commit 26fc40a09221330 ("net: pppolac/pppopns: Replace msg.msg_iov with iov_iter_kvec()") removed the msg_flags when removing the iov fields. This lead to problems with VPN data transfers. Change-Id: Ib86ab3f927c5cf36cbad0bab501575999dc2b084 Fixes: 26fc40a09221330 ("net: pppolac/pppopns: Replace msg.msg_iov with iov_iter_kvec()") Signed-off-by: Subash Abhinov Kasiviswanathan --- drivers/net/ppp/pppolac.c | 4 +++- drivers/net/ppp/pppopns.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/pppolac.c b/drivers/net/ppp/pppolac.c index 0184c96579e9..3a45cf805288 100644 --- a/drivers/net/ppp/pppolac.c +++ b/drivers/net/ppp/pppolac.c @@ -206,7 +206,9 @@ static void pppolac_xmit_core(struct work_struct *delivery_work) while ((skb = skb_dequeue(&delivery_queue))) { struct sock *sk_udp = skb->sk; struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len}; - struct msghdr msg = { 0 }; + struct msghdr msg = { + .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT, + }; iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, skb->len); diff --git a/drivers/net/ppp/pppopns.c b/drivers/net/ppp/pppopns.c index d9e06039794e..cdb4fa1af734 100644 --- a/drivers/net/ppp/pppopns.c +++ b/drivers/net/ppp/pppopns.c @@ -189,7 +189,9 @@ static void pppopns_xmit_core(struct work_struct *delivery_work) while ((skb = skb_dequeue(&delivery_queue))) { struct sock *sk_raw = skb->sk; struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len}; - struct msghdr msg = { 0 }; + struct msghdr msg = { + .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT, + }; iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, skb->len); From 69a14c17de7edbaa354d56bbd25718a6e2cd6089 Mon Sep 17 00:00:00 2001 From: Witold Sciuk Date: Sat, 13 Feb 2016 11:08:37 +0100 Subject: [PATCH 0785/3220] ANDROID: usb: gadget: f_mtp: Set 0xFFFFFFFF in mtp header ContainerLength field Value 0xFFFFFFFF should be set according specification of MTP for large files when fileSize + mtpHeader is greater than 0xFFFFFFFF. 
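For example, a 6 GiB file (0x180000000 bytes) plus the 12-byte MTP container header cannot be represented in the 32-bit ContainerLength field, so 0xFFFFFFFF is sent instead.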
MTP Specification, Appendix H - USB Optimizations Patchset: mtp Change-Id: I6213de052914350be2f87b73f8135f9c0cd05d7c Signed-off-by: Witold Sciuk Signed-off-by: Konrad Leszczynski Signed-off-by: Russ Weight --- drivers/usb/gadget/function/f_mtp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index b37cc8d87ca4..0fa79b0c26ab 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -43,6 +43,7 @@ #define MTP_BULK_BUFFER_SIZE 16384 #define INTR_BUFFER_SIZE 28 #define MAX_INST_NAME_LEN 40 +#define MTP_MAX_FILE_SIZE 0xFFFFFFFFL /* String IDs */ #define INTERFACE_STRING_INDEX 0 @@ -766,7 +767,12 @@ static void send_file_work(struct work_struct *data) if (hdr_size) { /* prepend MTP data header */ header = (struct mtp_data_header *)req->buf; - header->length = __cpu_to_le32(count); + /* + * set file size with header according to + * MTP Specification v1.0 + */ + header->length = (count > MTP_MAX_FILE_SIZE) ? + MTP_MAX_FILE_SIZE : __cpu_to_le32(count); header->type = __cpu_to_le16(2); /* data packet */ header->command = __cpu_to_le16(dev->xfer_command); header->transaction_id = From 4f8785184972bf230383dfbbda3e1c0b21dc2312 Mon Sep 17 00:00:00 2001 From: Konrad Leszczynski Date: Wed, 24 Feb 2016 10:47:12 +0000 Subject: [PATCH 0786/3220] ANDROID: usb: gadget: f_audio_source: disable the CPU C-states upon playback Due to the issue with the isoc transfers being interrupted by the CPU going into the idle state, the C-states will be disabled for the playback time. Change-Id: If4e52673606923d7e33a1d1dbe0192b8ad24f78c Signed-off-by: Konrad Leszczynski Signed-off-by: Russ Weight --- drivers/usb/gadget/function/f_audio_source.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c index db7903d19c43..8124af33b738 100644 --- a/drivers/usb/gadget/function/f_audio_source.c +++ b/drivers/usb/gadget/function/f_audio_source.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -268,6 +269,8 @@ struct audio_dev { /* number of frames sent since start_time */ s64 frames_sent; struct audio_source_config *config; + /* for creating and issuing QoS requests */ + struct pm_qos_request pm_qos; }; static inline struct audio_dev *func_to_audio(struct usb_function *f) @@ -740,6 +743,10 @@ static int audio_pcm_open(struct snd_pcm_substream *substream) runtime->hw.channels_max = 2; audio->substream = substream; + + /* Add the QoS request and set the latency to 0 */ + pm_qos_add_request(&audio->pm_qos, PM_QOS_CPU_DMA_LATENCY, 0); + return 0; } @@ -749,6 +756,10 @@ static int audio_pcm_close(struct snd_pcm_substream *substream) unsigned long flags; spin_lock_irqsave(&audio->lock, flags); + + /* Remove the QoS request */ + pm_qos_remove_request(&audio->pm_qos); + audio->substream = NULL; spin_unlock_irqrestore(&audio->lock, flags); From 3c72862e8f645c2e2a936646e3568b45e44c268a Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 5 May 2017 01:15:33 +0530 Subject: [PATCH 0787/3220] ANDROID: arm64: suspend: Restore the UAO state Upstream commit d08544127d9f ("arm64: suspend: Reconfigure PSTATE after resume from idle") when cherry-picked on LTS linux-4.4.y removed UAO reset code because UAO is not supported in linux-4.4.y. 
But common/android-4.4 has UAO support, added in Change-Id: I1a6a74a1f33b92d54368bd99387b55cf62930903 ("UPSTREAM: arm64: kernel: Add support for User Access Override"). This patch pick up the UAO specific changes of upstream commit dropped in LTS cherry-pick. Fixes: LTS commit 71710cd35a55 ("arm64: suspend: Reconfigure PSTATE after resume from idle") Signed-off-by: Amit Pundir --- arch/arm64/include/asm/exec.h | 3 +++ arch/arm64/kernel/process.c | 3 ++- arch/arm64/kernel/suspend.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/exec.h b/arch/arm64/include/asm/exec.h index db0563c23482..f7865dd9d868 100644 --- a/arch/arm64/include/asm/exec.h +++ b/arch/arm64/include/asm/exec.h @@ -18,6 +18,9 @@ #ifndef __ASM_EXEC_H #define __ASM_EXEC_H +#include + extern unsigned long arch_align_stack(unsigned long sp); +void uao_thread_switch(struct task_struct *next); #endif /* __ASM_EXEC_H */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6f3fb46170bf..e6afea67f6c1 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -379,7 +380,7 @@ static void tls_thread_switch(struct task_struct *next) } /* Restore the UAO state depending on next's addr_limit */ -static void uao_thread_switch(struct task_struct *next) +void uao_thread_switch(struct task_struct *next) { if (IS_ENABLED(CONFIG_ARM64_UAO)) { if (task_thread_info(next)->addr_limit == KERNEL_DS) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 20b6b9b02cc8..f42b8b8f1d0a 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +107,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)); + uao_thread_switch(current); /* * Restore HW breakpoint registers to sane values From 40d8db5ddf18c8415a3d789515e14d8ddbb638ee Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 5 May 2017 18:36:51 -0700 Subject: [PATCH 0788/3220] ANDROID: android-base.cfg: remove defunct options INET6_DIAG_DESTROY and NETFILTER_TPROXY are not used anymore so they should not be part of the base Android kernel configuration. Bug: 37749708 Change-Id: Iab263a5723f1810e2133919b8db93cc2bb986624 Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 30e01074f64a..748ccf37ca3e 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -29,7 +29,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y -CONFIG_INET6_DIAG_DESTROY=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y CONFIG_INET=y @@ -72,7 +71,6 @@ CONFIG_MODVERSIONS=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y -CONFIG_NETFILTER_TPROXY=y CONFIG_NETFILTER_XT_MATCH_COMMENT=y CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y CONFIG_NETFILTER_XT_MATCH_CONNMARK=y From cc756e682c1a296acd14471d122646c92f6936f3 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Sun, 7 May 2017 14:03:43 -0700 Subject: [PATCH 0789/3220] ANDROID: android-base.cfg: remove USB_OTG_WAKELOCK CONFIG_USB_OTG_WAKELOCK is currently somewhat outdated and as such is not applicable to all Android devices. Until it is brought up to date, remove it from the base Android kernel configuration. 
Bug: 37750863 Change-Id: I5b1c0bef24476cc503a60003bf48ffb59eea8c94 Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 748ccf37ca3e..3c32c74c584a 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -171,5 +171,4 @@ CONFIG_USB_CONFIGFS_F_MTP=y CONFIG_USB_CONFIGFS_F_PTP=y CONFIG_USB_CONFIGFS_UEVENT=y CONFIG_USB_GADGET=y -CONFIG_USB_OTG_WAKELOCK=y CONFIG_XFRM_USER=y From b5428181f94bd0d1ff5d321bd58413bdb61f8df4 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 19 Apr 2017 14:22:47 -0700 Subject: [PATCH 0790/3220] ANDROID: Add untag hacks to inet_release function To prevent protential risk of memory leak caused by closing socket with out untag it from qtaguid module, the qtaguid module now do not hold any socket file reference count. Instead, it will increase the sk_refcnt of the sk struct to prevent a reuse of the socket pointer. And when a socket is released. It will delete the tag if the socket is previously tagged so no more resources is held by xt_qtaguid moudle. A flag is added to the untag process to prevent possible kernel crash caused by fail to delete corresponding socket_tag_entry list. Bug: 36374484 Test: compile and run test under system/extra/test/iptables, run cts -m CtsNetTestCases -t android.net.cts.SocketRefCntTest Signed-off-by: Chenbo Feng Change-Id: Iea7c3bf0c59b9774a5114af905b2405f6bc9ee52 --- include/linux/netfilter/xt_qtaguid.h | 1 + net/ipv4/af_inet.c | 4 + net/netfilter/xt_qtaguid.c | 107 ++++++++++++++------------- net/netfilter/xt_qtaguid_internal.h | 2 - net/netfilter/xt_qtaguid_print.c | 8 +- 5 files changed, 63 insertions(+), 59 deletions(-) diff --git a/include/linux/netfilter/xt_qtaguid.h b/include/linux/netfilter/xt_qtaguid.h index ca60fbdec2f3..1c671552ec37 100644 --- a/include/linux/netfilter/xt_qtaguid.h +++ b/include/linux/netfilter/xt_qtaguid.h @@ -10,4 +10,5 @@ #define XT_QTAGUID_SOCKET XT_OWNER_SOCKET #define xt_qtaguid_match_info xt_owner_match_info +int qtaguid_untag(struct socket *sock, bool kernel); #endif /* _XT_QTAGUID_MATCH_H */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68bf7bdf7fdb..eb5cfc1478b1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -89,6 +89,7 @@ #include #include #include +#include #include @@ -413,6 +414,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; +#ifdef CONFIG_NETFILTER_XT_MATCH_QTAGUID + qtaguid_untag(sock, true); +#endif /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 0f5628a59917..f95aba44e649 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -321,7 +321,7 @@ static void sock_tag_tree_erase(struct rb_root *st_to_free_tree) st_entry->tag, get_uid_from_tag(st_entry->tag)); rb_erase(&st_entry->sock_node, st_to_free_tree); - sockfd_put(st_entry->socket); + sock_put(st_entry->sk); kfree(st_entry); } } @@ -1929,12 +1929,12 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) { struct sock_tag *sock_tag_entry = v; uid_t uid; - long f_count; CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u\n", current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); if (sock_tag_entry != SEQ_START_TOKEN) { + int sk_ref_count; uid = get_uid_from_tag(sock_tag_entry->tag); CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) " "pid=%u\n", @@ -1943,13 +1943,13 @@ 
static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) uid, sock_tag_entry->pid ); - f_count = atomic_long_read( - &sock_tag_entry->socket->file->f_count); + sk_ref_count = atomic_read( + &sock_tag_entry->sk->sk_refcnt); seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " - "f_count=%lu\n", + "f_count=%d\n", sock_tag_entry->sk, sock_tag_entry->tag, uid, - sock_tag_entry->pid, f_count); + sock_tag_entry->pid, sk_ref_count); } else { seq_printf(m, "events: sockets_tagged=%llu " "sockets_untagged=%llu " @@ -2245,8 +2245,8 @@ static int ctrl_cmd_tag(const char *input) from_kuid(&init_user_ns, current_fsuid())); goto err; } - CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%ld ->sk=%p\n", - input, atomic_long_read(&el_socket->file->f_count), + CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->sk_refcnt=%d ->sk=%p\n", + input, atomic_read(&el_socket->sk->sk_refcnt), el_socket->sk); if (argc < 3) { acct_tag = make_atag_from_value(0); @@ -2290,16 +2290,9 @@ static int ctrl_cmd_tag(const char *input) struct tag_ref *prev_tag_ref_entry; CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p " - "st@%p ...->f_count=%ld\n", + "st@%p ...->sk_refcnt=%d\n", input, el_socket->sk, sock_tag_entry, - atomic_long_read(&el_socket->file->f_count)); - /* - * This is a re-tagging, so release the sock_fd that was - * locked at the time of the 1st tagging. - * There is still the ref from this call's sockfd_lookup() so - * it can be done within the spinlock. - */ - sockfd_put(sock_tag_entry->socket); + atomic_read(&el_socket->sk->sk_refcnt)); prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &uid_tag_data_entry); BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry)); @@ -2319,8 +2312,12 @@ static int ctrl_cmd_tag(const char *input) res = -ENOMEM; goto err_tag_unref_put; } + /* + * Hold the sk refcount here to make sure the sk pointer cannot + * be freed and reused + */ + sock_hold(el_socket->sk); sock_tag_entry->sk = el_socket->sk; - sock_tag_entry->socket = el_socket; sock_tag_entry->pid = current->tgid; sock_tag_entry->tag = combine_atag_with_uid(acct_tag, uid_int); spin_lock_bh(&uid_tag_data_tree_lock); @@ -2347,10 +2344,11 @@ static int ctrl_cmd_tag(const char *input) atomic64_inc(&qtu_events.sockets_tagged); } spin_unlock_bh(&sock_tag_list_lock); - /* We keep the ref to the socket (file) until it is untagged */ - CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->f_count=%ld\n", + /* We keep the ref to the sk until it is untagged */ + CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->sk_refcnt=%d\n", input, sock_tag_entry, - atomic_long_read(&el_socket->file->f_count)); + atomic_read(&el_socket->sk->sk_refcnt)); + sockfd_put(el_socket); return 0; err_tag_unref_put: @@ -2358,8 +2356,8 @@ err_tag_unref_put: tag_ref_entry->num_sock_tags--; free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry); err_put: - CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->f_count=%ld\n", - input, atomic_long_read(&el_socket->file->f_count) - 1); + CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->sk_refcnt=%d\n", + input, atomic_read(&el_socket->sk->sk_refcnt) - 1); /* Release the sock_fd that was grabbed by sockfd_lookup(). 
*/ sockfd_put(el_socket); return res; @@ -2375,17 +2373,13 @@ static int ctrl_cmd_untag(const char *input) int sock_fd = 0; struct socket *el_socket; int res, argc; - struct sock_tag *sock_tag_entry; - struct tag_ref *tag_ref_entry; - struct uid_tag_data *utd_entry; - struct proc_qtu_data *pqd_entry; argc = sscanf(input, "%c %d", &cmd, &sock_fd); CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n", input, argc, cmd, sock_fd); if (argc < 2) { res = -EINVAL; - goto err; + return res; } el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */ if (!el_socket) { @@ -2393,17 +2387,31 @@ static int ctrl_cmd_untag(const char *input) " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n", input, sock_fd, res, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - goto err; + return res; } CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n", input, atomic_long_read(&el_socket->file->f_count), el_socket->sk); + res = qtaguid_untag(el_socket, false); + sockfd_put(el_socket); + return res; +} + +int qtaguid_untag(struct socket *el_socket, bool kernel) +{ + int res; + pid_t pid; + struct sock_tag *sock_tag_entry; + struct tag_ref *tag_ref_entry; + struct uid_tag_data *utd_entry; + struct proc_qtu_data *pqd_entry; + spin_lock_bh(&sock_tag_list_lock); sock_tag_entry = get_sock_stat_nl(el_socket->sk); if (!sock_tag_entry) { spin_unlock_bh(&sock_tag_list_lock); res = -EINVAL; - goto err_put; + return res; } /* * The socket already belongs to the current process @@ -2415,20 +2423,26 @@ static int ctrl_cmd_untag(const char *input) BUG_ON(!tag_ref_entry); BUG_ON(tag_ref_entry->num_sock_tags <= 0); spin_lock_bh(&uid_tag_data_tree_lock); + if (kernel) + pid = sock_tag_entry->pid; + else + pid = current->tgid; pqd_entry = proc_qtu_data_tree_search( - &proc_qtu_data_tree, current->tgid); + &proc_qtu_data_tree, pid); /* * TODO: remove if, and start failing. * At first, we want to catch user-space code that is not * opening the /dev/xt_qtaguid. */ - if (IS_ERR_OR_NULL(pqd_entry)) + if (IS_ERR_OR_NULL(pqd_entry) || !sock_tag_entry->list.next) { pr_warn_once("qtaguid: %s(): " "User space forgot to open /dev/xt_qtaguid? " - "pid=%u tgid=%u uid=%u\n", __func__, - current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid())); - else + "pid=%u tgid=%u sk_pid=%u, uid=%u\n", __func__, + current->pid, current->tgid, sock_tag_entry->pid, + from_kuid(&init_user_ns, current_fsuid())); + } else { list_del(&sock_tag_entry->list); + } spin_unlock_bh(&uid_tag_data_tree_lock); /* * We don't free tag_ref from the utd_entry here, @@ -2437,30 +2451,17 @@ static int ctrl_cmd_untag(const char *input) tag_ref_entry->num_sock_tags--; spin_unlock_bh(&sock_tag_list_lock); /* - * Release the sock_fd that was grabbed at tag time, - * and once more for the sockfd_lookup() here. + * Release the sock_fd that was grabbed at tag time. */ - sockfd_put(sock_tag_entry->socket); - CT_DEBUG("qtaguid: ctrl_untag(%s): done. st@%p ...->f_count=%ld\n", - input, sock_tag_entry, - atomic_long_read(&el_socket->file->f_count) - 1); - sockfd_put(el_socket); + sock_put(sock_tag_entry->sk); + CT_DEBUG("qtaguid: done. st@%p ...->sk_refcnt=%d\n", + sock_tag_entry, + atomic_read(&el_socket->sk->sk_refcnt)); kfree(sock_tag_entry); atomic64_inc(&qtu_events.sockets_untagged); return 0; - -err_put: - CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%ld\n", - input, atomic_long_read(&el_socket->file->f_count) - 1); - /* Release the sock_fd that was grabbed by sockfd_lookup(). 
*/ - sockfd_put(el_socket); - return res; - -err: - CT_DEBUG("qtaguid: ctrl_untag(%s): done.\n", input); - return res; } static ssize_t qtaguid_ctrl_parse(const char *input, size_t count) diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index 6dc14a9c6889..8178fbdfb036 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -256,8 +256,6 @@ struct iface_stat_work { struct sock_tag { struct rb_node sock_node; struct sock *sk; /* Only used as a number, never dereferenced */ - /* The socket is needed for sockfd_put() */ - struct socket *socket; /* Used to associate with a given pid */ struct list_head list; /* in proc_qtu_data.sock_tag_list */ pid_t pid; diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c index f6a00a3520ed..2a7190d285e6 100644 --- a/net/netfilter/xt_qtaguid_print.c +++ b/net/netfilter/xt_qtaguid_print.c @@ -24,7 +24,7 @@ #include #include #include - +#include #include "xt_qtaguid_internal.h" #include "xt_qtaguid_print.h" @@ -237,10 +237,10 @@ char *pp_sock_tag(struct sock_tag *st) tag_str = pp_tag_t(&st->tag); res = kasprintf(GFP_ATOMIC, "sock_tag@%p{" "sock_node=rb_node{...}, " - "sk=%p socket=%p (f_count=%lu), list=list_head{...}, " + "sk=%p (f_count=%d), list=list_head{...}, " "pid=%u, tag=%s}", - st, st->sk, st->socket, atomic_long_read( - &st->socket->file->f_count), + st, st->sk, atomic_read( + &st->sk->sk_refcnt), st->pid, tag_str); _bug_on_err_or_null(res); kfree(tag_str); From 907e828e010b0609016fabd0732e82b1872e0c98 Mon Sep 17 00:00:00 2001 From: Daniel Roseberg Date: Tue, 9 May 2017 13:36:35 -0700 Subject: [PATCH 0791/3220] ANDROID: sdcardfs: Don't iput if we didn't igrab If we fail to get top, top is either NULL, or igrab found that we're in the process of freeing that inode, and did not grab it. Either way, we didn't grab it, and have no business putting it. Signed-off-by: Daniel Rosenberg Bug: 38117720 Change-Id: Ie2f587483b9abb5144263156a443e89bc69b767b --- fs/sdcardfs/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index f15cb11ca8fd..4f09eebd7d95 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -618,11 +618,8 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma struct inode tmp; struct inode *top = grab_top(SDCARDFS_I(inode)); - if (!top) { - release_top(SDCARDFS_I(inode)); - WARN(1, "Top value was null!\n"); + if (!top) return -EINVAL; - } /* * Permission check on sdcardfs inode. @@ -696,10 +693,8 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct inode = d_inode(dentry); top = grab_top(SDCARDFS_I(inode)); - if (!top) { - release_top(SDCARDFS_I(inode)); + if (!top) return -EINVAL; - } /* * Permission check on sdcardfs inode. From bc86c1de1dedcb4e49a5470a85dc20291334eded Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Tue, 9 May 2017 18:05:15 -0700 Subject: [PATCH 0792/3220] ANDROID: android-base.cfg: remove NETFILTER_XT_MATCH_QUOTA2_LOG There are currently a couple different implementations for this functionality. Until things are unified, remove the requirement for this kernel config. 
Bug: 37749708 Change-Id: I10ef038edc656185644d1dcb128658136a8c994f Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 3c32c74c584a..23686fc15d45 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -86,7 +86,6 @@ CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y CONFIG_NETFILTER_XT_MATCH_POLICY=y CONFIG_NETFILTER_XT_MATCH_QTAGUID=y CONFIG_NETFILTER_XT_MATCH_QUOTA2=y -CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG=y CONFIG_NETFILTER_XT_MATCH_QUOTA=y CONFIG_NETFILTER_XT_MATCH_SOCKET=y CONFIG_NETFILTER_XT_MATCH_STATE=y From 21ade372581bd235525a25b4d11b72d3b4129307 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 10 May 2017 23:01:15 +0800 Subject: [PATCH 0793/3220] ANDROID: sdcardfs: fix sdcardfs_destroy_inode for the inode RCU approach According to the following commits, fs: icache RCU free inodes vfs: fix the stupidity with i_dentry in inode destructors sdcardfs_destroy_inode should be fixed for the fast path safety. Signed-off-by: Gao Xiang Change-Id: I84f43c599209d23737c7e28b499dd121cb43636d --- fs/sdcardfs/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index a3393e959c63..8a9c9c7adca2 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -192,9 +192,16 @@ static struct inode *sdcardfs_alloc_inode(struct super_block *sb) return &i->vfs_inode; } +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode)); +} + static void sdcardfs_destroy_inode(struct inode *inode) { - kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode)); + call_rcu(&inode->i_rcu, i_callback); } /* sdcardfs inode cache constructor */ From a8cc97d8cf4a0d0cf8ce1d99a09546745b11133e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 10 May 2017 23:54:04 +0900 Subject: [PATCH 0794/3220] ANDROID: make PF_KEY SHA256 use RFC-compliant truncation. When using the PF_KEY interface, SHA-256 hashes are hardcoded to use 96-bit truncation. This is a violation of RFC4868, which specifies 128-bit truncation, but will not be fixed upstream due to backwards compatibility concerns and because the PF_KEY interface is deprecated in favour of netlink XFRM (which allows the app to specify an arbitrary truncation length). Change the hardcoded truncation length from 96 to 128 so that PF_KEY apps such as racoon will work with standards-compliant VPN servers. 
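For reference, a userspace sketch of how a netlink-XFRM application requests the RFC4868 truncation explicitly (illustrative only: key bytes are omitted, and the structure is the standard linux/xfrm.h UAPI, passed to the kernel via the XFRMA_ALG_AUTH_TRUNC attribute):

#include <linux/xfrm.h>
#include <string.h>

static void fill_sha256_auth(struct xfrm_algo_auth *auth)
{
	/* algorithm name as understood by the kernel crypto layer */
	strncpy(auth->alg_name, "hmac(sha256)", sizeof(auth->alg_name));
	auth->alg_key_len = 256;    /* HMAC key length, in bits */
	auth->alg_trunc_len = 128;  /* RFC4868-compliant ICV truncation */
}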
Bug: 34114242 Change-Id: Ie46bff4b6358f18117d0be241171d677d31d33f7 Signed-off-by: Lorenzo Colitti --- net/xfrm/xfrm_algo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index f07224d8b88f..9adfdd711b31 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -239,7 +239,7 @@ static struct xfrm_algo_desc aalg_list[] = { .uinfo = { .auth = { - .icv_truncbits = 96, + .icv_truncbits = 128, .icv_fullbits = 256, } }, From 1b9d700c89bd1c2a3a1b303a91f8cd03fc4b727e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 May 2017 11:21:25 +0200 Subject: [PATCH 0795/3220] ANDROID: rfkill: fix unused function warning As reported by kernelci, we get a harmless warning in this driver when CONFIG_PM is disabled: net/rfkill/core.c:810:12: warning: 'rfkill_resume' defined but not used [-Wunused-function] net/rfkill/core.c:800:12: warning: 'rfkill_suspend' defined but not used [-Wunused-function] This marks the functions as __maybe_unused to remove the #ifdef, and uses a conditional reference to the pm operations instead, which will work in any combination. The patch is needed for both android-common-4.9 and 4.4. Link: https://kernelci.org/build/id/59117c2f59b5147b06b12d54/logs/ Fixes: de6f7210e931 ("ANDROID: rfkill: Introduce CONFIG_RFKILL_PM and use instead of CONFIG_PM to power down") Cc: Nick Pelly Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- net/rfkill/core.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/net/rfkill/core.c b/net/rfkill/core.c index d778d99326df..71e1f0def5a5 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -802,8 +802,7 @@ void rfkill_resume_polling(struct rfkill *rfkill) } EXPORT_SYMBOL(rfkill_resume_polling); -#ifdef CONFIG_RFKILL_PM -static int rfkill_suspend(struct device *dev) +static __maybe_unused int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); @@ -812,7 +811,7 @@ static int rfkill_suspend(struct device *dev) return 0; } -static int rfkill_resume(struct device *dev) +static __maybe_unused int rfkill_resume(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); bool cur; @@ -828,19 +827,13 @@ static int rfkill_resume(struct device *dev) } static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume); -#define RFKILL_PM_OPS (&rfkill_pm_ops) -#else -#define RFKILL_PM_OPS NULL -#endif static struct class rfkill_class = { .name = "rfkill", .dev_release = rfkill_release, .dev_groups = rfkill_dev_groups, .dev_uevent = rfkill_dev_uevent, -#ifdef CONFIG_RFKILL_PM - .pm = RFKILL_PM_OPS, -#endif + .pm = IS_ENABLED(CONFIG_RFKILL_PM) ? &rfkill_pm_ops : NULL, }; bool rfkill_blocked(struct rfkill *rfkill) From d3cb1ad1837e0f7d688e64f436ca96fd1199c402 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 May 2017 11:21:27 +0200 Subject: [PATCH 0796/3220] ANDROID: memory_state_time: fix undefined behavior with missing DT properties kernelci reports warnings about unintialized variable usage: drivers/misc/memory_state_time.c:351:12: warning: 'lenf' is used uninitialized in this function [-Wuninitialized] drivers/misc/memory_state_time.c:321:14: warning: 'lenb' is used uninitialized in this function [-Wuninitialized] In both cases we try to continue without a DT property but use the length that has not been assigned at this point. This rearranges the code in the two functions to bail out earlier in case of an error. The patch is needed for both android-common-4.9, 4.4 and 3.18. 
Link: https://kernelci.org/build/id/591177f459b5147648b12d54/logs/ Fixes: ad3c02f8b3a5 ("ANDROID: Implement memory_state_time, used by qcom,cpubw") Cc: James Carr Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/misc/memory_state_time.c | 78 ++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c index 34c797a06a31..ba94dcf09169 100644 --- a/drivers/misc/memory_state_time.c +++ b/drivers/misc/memory_state_time.c @@ -296,27 +296,31 @@ static int get_bw_buckets(struct device *dev) struct device_node *node = dev->of_node; of_property_read_u32(node, NUM_SOURCES, &num_sources); - if (of_find_property(node, BW_TBL, &lenb)) { - bandwidths = devm_kzalloc(dev, - sizeof(*bandwidths) * num_sources, GFP_KERNEL); - if (!bandwidths) - return -ENOMEM; - lenb /= sizeof(*bw_buckets); - bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), - GFP_KERNEL); - if (!bw_buckets) { - devm_kfree(dev, bandwidths); - return -ENOMEM; - } - ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, - lenb); - if (ret < 0) { - devm_kfree(dev, bandwidths); - devm_kfree(dev, bw_buckets); - pr_err("Unable to read bandwidth table from device tree.\n"); - return ret; - } + if (!of_find_property(node, BW_TBL, &lenb)) { + pr_err("Missing %s property\n", BW_TBL); + return -ENODATA; } + + bandwidths = devm_kzalloc(dev, + sizeof(*bandwidths) * num_sources, GFP_KERNEL); + if (!bandwidths) + return -ENOMEM; + lenb /= sizeof(*bw_buckets); + bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets), + GFP_KERNEL); + if (!bw_buckets) { + devm_kfree(dev, bandwidths); + return -ENOMEM; + } + ret = of_property_read_u32_array(node, BW_TBL, bw_buckets, + lenb); + if (ret < 0) { + devm_kfree(dev, bandwidths); + devm_kfree(dev, bw_buckets); + pr_err("Unable to read bandwidth table from device tree.\n"); + return ret; + } + curr_bw = 0; num_buckets = lenb; return 0; @@ -332,22 +336,26 @@ static int freq_buckets_init(struct device *dev) int ret, lenf; struct device_node *node = dev->of_node; - if (of_find_property(node, FREQ_TBL, &lenf)) { - lenf /= sizeof(*freq_buckets); - freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), - GFP_KERNEL); - if (!freq_buckets) - return -ENOMEM; - pr_debug("freqs found len %d\n", lenf); - ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, - lenf); - if (ret < 0) { - devm_kfree(dev, freq_buckets); - pr_err("Unable to read frequency table from device tree.\n"); - return ret; - } - pr_debug("ret freq %d\n", ret); + if (!of_find_property(node, FREQ_TBL, &lenf)) { + pr_err("Missing %s property\n", FREQ_TBL); + return -ENODATA; } + + lenf /= sizeof(*freq_buckets); + freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets), + GFP_KERNEL); + if (!freq_buckets) + return -ENOMEM; + pr_debug("freqs found len %d\n", lenf); + ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets, + lenf); + if (ret < 0) { + devm_kfree(dev, freq_buckets); + pr_err("Unable to read frequency table from device tree.\n"); + return ret; + } + pr_debug("ret freq %d\n", ret); + num_freqs = lenf; curr_freq = freq_buckets[LOWEST_FREQ]; From a347e1c4ddc50afc383ff0cc197560777ba7d95b Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 12 May 2017 10:17:05 -0700 Subject: [PATCH 0797/3220] ANDROID: android-base.cfg: remove spurious CONFIG_MODULES line CONFIG_MODULES must be enabled as part of the android base kernel configuration. 
There is already a line specifying that the option be enabled, but there was a pre-existing line requiring that it be disabled. Remove it. Bug: 38224475 Change-Id: I608de5ae68f3a03d5da4e5800bbf37cc71dff8b8 Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 23686fc15d45..b7b997fd58c9 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -3,7 +3,6 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set -# CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set From 73661512d284dac703ffbeca68c675cebd648110 Mon Sep 17 00:00:00 2001 From: Jiebing Li Date: Tue, 10 Mar 2015 11:27:10 +0800 Subject: [PATCH 0798/3220] ANDROID: usb: f_mtp: return error code if transfer error in receive_file_work function The receive_file_work() function should report an error if it detects that the urb was not successfully transferred.
Patchset: mtp Change-Id: I66898afe6b6ea2c33e6ea4c5ccf47d3a56d002dc Signed-off-by: Du, Changbin Signed-off-by: Wang, Yu Signed-off-by: Russ Weight --- drivers/usb/gadget/function/f_mtp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c index 0fa79b0c26ab..87ec420df0a6 100644 --- a/drivers/usb/gadget/function/f_mtp.c +++ b/drivers/usb/gadget/function/f_mtp.c @@ -873,6 +873,10 @@ static void receive_file_work(struct work_struct *data) usb_ep_dequeue(dev->ep_out, read_req); break; } + if (read_req->status) { + r = read_req->status; + break; + } /* if xfer_file_length is 0xFFFFFFFF, then we read until * we get a zero length packet */ From 9a83b8157a75b297e3a5f105bf209a44184ca52d Mon Sep 17 00:00:00 2001 From: Jiebing Li Date: Tue, 10 Mar 2015 11:25:50 +0800 Subject: [PATCH 0799/3220] ANDROID: usb: gadget: fix NULL pointer issue in mtp_read() The pointer dev->ep_out->desc is set to NULL if the MTP function is disabled during a read operation, so we need to check the pointer before accessing it, and add spinlock protection in case it is modified elsewhere in the future.
Signed-off-by: Daniel Rosenberg Bug: 38045152 Change-Id: I1968e480d966c3f234800b72e43670ca11e1d3fd --- fs/sdcardfs/dentry.c | 15 +++++ fs/sdcardfs/derived_perm.c | 130 +++++++++++++++++++------------------ fs/sdcardfs/inode.c | 53 ++++++++------- fs/sdcardfs/lookup.c | 5 +- fs/sdcardfs/main.c | 8 +-- fs/sdcardfs/packagelist.c | 2 +- fs/sdcardfs/sdcardfs.h | 118 +++++++++++++++++++++------------ fs/sdcardfs/super.c | 49 ++++++++++++-- 8 files changed, 239 insertions(+), 141 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 7a19e77fce99..83ae9103a0f6 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -34,6 +34,8 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) struct dentry *parent_lower_dentry = NULL; struct dentry *lower_cur_parent_dentry = NULL; struct dentry *lower_dentry = NULL; + struct inode *inode; + struct sdcardfs_inode_data *data; if (flags & LOOKUP_RCU) return -ECHILD; @@ -103,6 +105,19 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) spin_unlock(&dentry->d_lock); spin_unlock(&lower_dentry->d_lock); } + if (!err) + goto out; + + /* If our top's inode is gone, we may be out of date */ + inode = d_inode(dentry); + if (inode) { + data = top_data_get(SDCARDFS_I(inode)); + if (data->abandoned) { + d_drop(dentry); + err = 0; + } + data_put(data); + } out: dput(parent_dentry); diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index b4595aab5713..85a60fb5ff39 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -26,28 +26,28 @@ static void inherit_derived_state(struct inode *parent, struct inode *child) struct sdcardfs_inode_info *pi = SDCARDFS_I(parent); struct sdcardfs_inode_info *ci = SDCARDFS_I(child); - ci->perm = PERM_INHERIT; - ci->userid = pi->userid; - ci->d_uid = pi->d_uid; - ci->under_android = pi->under_android; - ci->under_cache = pi->under_cache; - ci->under_obb = pi->under_obb; - set_top(ci, pi->top); + ci->data->perm = PERM_INHERIT; + ci->data->userid = pi->data->userid; + ci->data->d_uid = pi->data->d_uid; + ci->data->under_android = pi->data->under_android; + ci->data->under_cache = pi->data->under_cache; + ci->data->under_obb = pi->data->under_obb; + set_top(ci, pi->top_data); } /* helper function for derived state */ void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, - uid_t uid, bool under_android, - struct inode *top) + uid_t uid, bool under_android, + struct sdcardfs_inode_data *top) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); - info->perm = perm; - info->userid = userid; - info->d_uid = uid; - info->under_android = under_android; - info->under_cache = false; - info->under_obb = false; + info->data->perm = perm; + info->data->userid = userid; + info->data->d_uid = uid; + info->data->under_android = under_android; + info->data->under_cache = false; + info->data->under_obb = false; set_top(info, top); } @@ -58,7 +58,8 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name) { struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry)); - struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent)); + struct sdcardfs_inode_data *parent_data = + SDCARDFS_I(d_inode(parent))->data; appid_t appid; unsigned long user_num; int err; @@ -82,60 +83,61 @@ void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, if (!S_ISDIR(d_inode(dentry)->i_mode)) return; /* Derive custom permissions based on parent and current node */ - 
switch (parent_info->perm) { + switch (parent_data->perm) { case PERM_INHERIT: case PERM_ANDROID_PACKAGE_CACHE: /* Already inherited above */ break; case PERM_PRE_ROOT: /* Legacy internal layout places users at top level */ - info->perm = PERM_ROOT; + info->data->perm = PERM_ROOT; err = kstrtoul(name->name, 10, &user_num); if (err) - info->userid = 0; + info->data->userid = 0; else - info->userid = user_num; - set_top(info, &info->vfs_inode); + info->data->userid = user_num; + set_top(info, info->data); break; case PERM_ROOT: /* Assume masked off by default. */ if (qstr_case_eq(name, &q_Android)) { /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID; - info->under_android = true; - set_top(info, &info->vfs_inode); + info->data->perm = PERM_ANDROID; + info->data->under_android = true; + set_top(info, info->data); } break; case PERM_ANDROID: if (qstr_case_eq(name, &q_data)) { /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_DATA; - set_top(info, &info->vfs_inode); + info->data->perm = PERM_ANDROID_DATA; + set_top(info, info->data); } else if (qstr_case_eq(name, &q_obb)) { /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_OBB; - info->under_obb = true; - set_top(info, &info->vfs_inode); + info->data->perm = PERM_ANDROID_OBB; + info->data->under_obb = true; + set_top(info, info->data); /* Single OBB directory is always shared */ } else if (qstr_case_eq(name, &q_media)) { /* App-specific directories inside; let anyone traverse */ - info->perm = PERM_ANDROID_MEDIA; - set_top(info, &info->vfs_inode); + info->data->perm = PERM_ANDROID_MEDIA; + set_top(info, info->data); } break; case PERM_ANDROID_OBB: case PERM_ANDROID_DATA: case PERM_ANDROID_MEDIA: - info->perm = PERM_ANDROID_PACKAGE; + info->data->perm = PERM_ANDROID_PACKAGE; appid = get_appid(name->name); - if (appid != 0 && !is_excluded(name->name, parent_info->userid)) - info->d_uid = multiuser_get_uid(parent_info->userid, appid); - set_top(info, &info->vfs_inode); + if (appid != 0 && !is_excluded(name->name, parent_data->userid)) + info->data->d_uid = + multiuser_get_uid(parent_data->userid, appid); + set_top(info, info->data); break; case PERM_ANDROID_PACKAGE: if (qstr_case_eq(name, &q_cache)) { - info->perm = PERM_ANDROID_PACKAGE_CACHE; - info->under_cache = true; + info->data->perm = PERM_ANDROID_PACKAGE_CACHE; + info->data->under_cache = true; } break; } @@ -166,7 +168,8 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) struct inode *delegated_inode = NULL; int error; struct sdcardfs_inode_info *info; - struct sdcardfs_inode_info *info_top; + struct sdcardfs_inode_data *info_d; + struct sdcardfs_inode_data *info_top; perm_t perm; struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); uid_t uid = sbi->options.fs_low_uid; @@ -174,15 +177,16 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) struct iattr newattrs; info = SDCARDFS_I(d_inode(dentry)); - perm = info->perm; - if (info->under_obb) { + info_d = info->data; + perm = info_d->perm; + if (info_d->under_obb) { perm = PERM_ANDROID_OBB; - } else if (info->under_cache) { + } else if (info_d->under_cache) { perm = PERM_ANDROID_PACKAGE_CACHE; } else if (perm == PERM_INHERIT) { - info_top = SDCARDFS_I(grab_top(info)); + info_top = top_data_get(info); perm = info_top->perm; - release_top(info); + data_put(info_top); } switch (perm) { @@ -192,7 +196,7 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) case 
PERM_ANDROID_MEDIA: case PERM_ANDROID_PACKAGE: case PERM_ANDROID_PACKAGE_CACHE: - uid = multiuser_get_uid(info->userid, uid); + uid = multiuser_get_uid(info_d->userid, uid); break; case PERM_ANDROID_OBB: uid = AID_MEDIA_OBB; @@ -207,24 +211,24 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) case PERM_ANDROID_DATA: case PERM_ANDROID_MEDIA: if (S_ISDIR(d_inode(dentry)->i_mode)) - gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW); else - gid = multiuser_get_uid(info->userid, get_type(name)); + gid = multiuser_get_uid(info_d->userid, get_type(name)); break; case PERM_ANDROID_OBB: gid = AID_MEDIA_OBB; break; case PERM_ANDROID_PACKAGE: - if (uid_is_app(info->d_uid)) - gid = multiuser_get_ext_gid(info->d_uid); + if (uid_is_app(info_d->d_uid)) + gid = multiuser_get_ext_gid(info_d->d_uid); else - gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW); break; case PERM_ANDROID_PACKAGE_CACHE: - if (uid_is_app(info->d_uid)) - gid = multiuser_get_ext_cache_gid(info->d_uid); + if (uid_is_app(info_d->d_uid)) + gid = multiuser_get_ext_cache_gid(info_d->d_uid); else - gid = multiuser_get_uid(info->userid, AID_MEDIA_RW); + gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW); break; case PERM_PRE_ROOT: default: @@ -257,11 +261,13 @@ retry_deleg: sdcardfs_put_lower_path(dentry, &path); } -static int descendant_may_need_fixup(struct sdcardfs_inode_info *info, struct limit_search *limit) +static int descendant_may_need_fixup(struct sdcardfs_inode_data *data, + struct limit_search *limit) { - if (info->perm == PERM_ROOT) - return (limit->flags & BY_USERID)?info->userid == limit->userid:1; - if (info->perm == PERM_PRE_ROOT || info->perm == PERM_ANDROID) + if (data->perm == PERM_ROOT) + return (limit->flags & BY_USERID) ? 
+ data->userid == limit->userid : 1; + if (data->perm == PERM_PRE_ROOT || data->perm == PERM_ANDROID) return 1; return 0; } @@ -292,7 +298,7 @@ static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search * } info = SDCARDFS_I(d_inode(dentry)); - if (needs_fixup(info->perm)) { + if (needs_fixup(info->data->perm)) { list_for_each_entry(child, &dentry->d_subdirs, d_child) { spin_lock_nested(&child->d_lock, depth + 1); if (!(limit->flags & BY_NAME) || qstr_case_eq(&child->d_name, &limit->name)) { @@ -305,7 +311,7 @@ static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search * } spin_unlock(&child->d_lock); } - } else if (descendant_may_need_fixup(info, limit)) { + } else if (descendant_may_need_fixup(info->data, limit)) { list_for_each_entry(child, &dentry->d_subdirs, d_child) { __fixup_perms_recursive(child, limit, depth + 1); } @@ -349,12 +355,12 @@ int need_graft_path(struct dentry *dentry) struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); struct qstr obb = QSTR_LITERAL("obb"); - if (parent_info->perm == PERM_ANDROID && + if (parent_info->data->perm == PERM_ANDROID && qstr_case_eq(&dentry->d_name, &obb)) { /* /Android/obb is the base obbpath of DERIVED_UNIFIED */ if (!(sbi->options.multiuser == false - && parent_info->userid == 0)) { + && parent_info->data->userid == 0)) { ret = 1; } } @@ -415,11 +421,11 @@ int is_base_obbpath(struct dentry *dentry) spin_lock(&SDCARDFS_D(dentry)->lock); if (sbi->options.multiuser) { - if (parent_info->perm == PERM_PRE_ROOT && + if (parent_info->data->perm == PERM_PRE_ROOT && qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } - } else if (parent_info->perm == PERM_ANDROID && + } else if (parent_info->data->perm == PERM_ANDROID && qstr_case_eq(&dentry->d_name, &q_obb)) { ret = 1; } diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 4f09eebd7d95..60fea424835f 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -23,7 +23,8 @@ #include /* Do not directly use this function. Use OVERRIDE_CRED() instead. 
*/ -const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_info *info) +const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, + struct sdcardfs_inode_data *data) { struct cred *cred; const struct cred *old_cred; @@ -33,10 +34,10 @@ const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_ if (!cred) return NULL; - if (info->under_obb) + if (data->under_obb) uid = AID_MEDIA_OBB; else - uid = multiuser_get_uid(info->userid, sbi->options.fs_low_uid); + uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid); cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid); @@ -96,7 +97,8 @@ static int sdcardfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out; - err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, SDCARDFS_I(dir)->userid); + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, + SDCARDFS_I(dir)->data->userid); if (err) goto out; fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir)); @@ -267,7 +269,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct path lower_path; struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); const struct cred *saved_cred = NULL; - struct sdcardfs_inode_info *pi = SDCARDFS_I(dir); + struct sdcardfs_inode_data *pd = SDCARDFS_I(dir)->data; int touch_err = 0; struct fs_struct *saved_fs; struct fs_struct *copied_fs; @@ -336,7 +338,7 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode make_nomedia_in_obb = 1; } - err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pi->userid); + err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pd->userid); if (err) { unlock_dir(lower_parent_dentry); goto out; @@ -349,12 +351,13 @@ static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode fixup_lower_ownership(dentry, dentry->d_name.name); unlock_dir(lower_parent_dentry); if ((!sbi->options.multiuser) && (qstr_case_eq(&dentry->d_name, &q_obb)) - && (pi->perm == PERM_ANDROID) && (pi->userid == 0)) + && (pd->perm == PERM_ANDROID) && (pd->userid == 0)) make_nomedia_in_obb = 1; /* When creating /Android/data and /Android/obb, mark them as .nomedia */ if (make_nomedia_in_obb || - ((pi->perm == PERM_ANDROID) && (qstr_case_eq(&dentry->d_name, &q_data)))) { + ((pd->perm == PERM_ANDROID) + && (qstr_case_eq(&dentry->d_name, &q_data)))) { REVERT_CRED(saved_cred); OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(d_inode(dentry))); set_fs_pwd(current->fs, &lower_path); @@ -616,7 +619,7 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma { int err; struct inode tmp; - struct inode *top = grab_top(SDCARDFS_I(inode)); + struct sdcardfs_inode_data *top = top_data_get(SDCARDFS_I(inode)); if (!top) return -EINVAL; @@ -633,10 +636,11 @@ static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int ma * locks must be dealt with to avoid undefined behavior. 
*/ copy_attrs(&tmp, inode); - tmp.i_uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); - tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); - tmp.i_mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); - release_top(SDCARDFS_I(inode)); + tmp.i_uid = make_kuid(&init_user_ns, top->d_uid); + tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, top)); + tmp.i_mode = (inode->i_mode & S_IFMT) + | get_mode(mnt, SDCARDFS_I(inode), top); + data_put(top); tmp.i_sb = inode->i_sb; if (IS_POSIXACL(inode)) pr_warn("%s: This may be undefined behavior...\n", __func__); @@ -687,11 +691,11 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct struct iattr lower_ia; struct dentry *parent; struct inode tmp; - struct inode *top; + struct sdcardfs_inode_data *top; const struct cred *saved_cred = NULL; inode = d_inode(dentry); - top = grab_top(SDCARDFS_I(inode)); + top = top_data_get(SDCARDFS_I(inode)); if (!top) return -EINVAL; @@ -709,11 +713,12 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct * */ copy_attrs(&tmp, inode); - tmp.i_uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); - tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); - tmp.i_mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); + tmp.i_uid = make_kuid(&init_user_ns, top->d_uid); + tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, top)); + tmp.i_mode = (inode->i_mode & S_IFMT) + | get_mode(mnt, SDCARDFS_I(inode), top); tmp.i_size = i_size_read(inode); - release_top(SDCARDFS_I(inode)); + data_put(top); tmp.i_sb = inode->i_sb; /* @@ -815,17 +820,17 @@ static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode, struct kstat *stat) { struct sdcardfs_inode_info *info = SDCARDFS_I(inode); - struct inode *top = grab_top(info); + struct sdcardfs_inode_data *top = top_data_get(info); if (!top) return -EINVAL; stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; - stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, SDCARDFS_I(top)); + stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, info, top); stat->nlink = inode->i_nlink; - stat->uid = make_kuid(&init_user_ns, SDCARDFS_I(top)->d_uid); - stat->gid = make_kgid(&init_user_ns, get_gid(mnt, SDCARDFS_I(top))); + stat->uid = make_kuid(&init_user_ns, top->d_uid); + stat->gid = make_kgid(&init_user_ns, get_gid(mnt, top)); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; @@ -833,7 +838,7 @@ static int sdcardfs_fillattr(struct vfsmount *mnt, stat->ctime = inode->i_ctime; stat->blksize = (1 << inode->i_blkbits); stat->blocks = inode->i_blocks; - release_top(info); + data_put(top); return 0; } diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 509d5fbcb472..00ae21151f52 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -71,7 +71,7 @@ struct inode_data { static int sdcardfs_inode_test(struct inode *inode, void *candidate_data/*void *candidate_lower_inode*/) { struct inode *current_lower_inode = sdcardfs_lower_inode(inode); - userid_t current_userid = SDCARDFS_I(inode)->userid; + userid_t current_userid = SDCARDFS_I(inode)->data->userid; if (current_lower_inode == ((struct inode_data *)candidate_data)->lower_inode && current_userid == ((struct inode_data *)candidate_data)->id) @@ -438,7 +438,8 @@ struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path, SDCARDFS_I(dir)->userid); + ret = __sdcardfs_lookup(dentry, 
flags, &lower_parent_path, + SDCARDFS_I(dir)->data->userid); if (IS_ERR(ret)) goto out; if (ret) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 953d2156d2e9..3c5b51d49d21 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -327,13 +327,13 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, mutex_lock(&sdcardfs_super_list_lock); if (sb_info->options.multiuser) { setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT, - sb_info->options.fs_user_id, AID_ROOT, - false, d_inode(sb->s_root)); + sb_info->options.fs_user_id, AID_ROOT, + false, SDCARDFS_I(d_inode(sb->s_root))->data); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name); } else { setup_derived_state(d_inode(sb->s_root), PERM_ROOT, - sb_info->options.fs_user_id, AID_ROOT, - false, d_inode(sb->s_root)); + sb_info->options.fs_user_id, AID_ROOT, + false, SDCARDFS_I(d_inode(sb->s_root))->data); snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name); } fixup_tmp_permissions(d_inode(sb->s_root)); diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 89196e31073e..8495474258d5 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -156,7 +156,7 @@ int check_caller_access_to_name(struct inode *parent_node, const struct qstr *na struct qstr q_android_secure = QSTR_LITERAL("android_secure"); /* Always block security-sensitive files at root */ - if (parent_node && SDCARDFS_I(parent_node)->perm == PERM_ROOT) { + if (parent_node && SDCARDFS_I(parent_node)->data->perm == PERM_ROOT) { if (qstr_case_eq(name, &q_autorun) || qstr_case_eq(name, &q__android_secure) || qstr_case_eq(name, &q_android_secure)) { diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 2b67b9a8ef9f..31d37d7da4f9 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -81,7 +82,8 @@ */ #define fixup_tmp_permissions(x) \ do { \ - (x)->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(x)->d_uid); \ + (x)->i_uid = make_kuid(&init_user_ns, \ + SDCARDFS_I(x)->data->d_uid); \ (x)->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); \ (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\ } while (0) @@ -97,16 +99,16 @@ */ #define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info) \ do { \ - saved_cred = override_fsids(sdcardfs_sbi, info); \ - if (!saved_cred) \ - return -ENOMEM; \ + saved_cred = override_fsids(sdcardfs_sbi, info->data); \ + if (!saved_cred) \ + return -ENOMEM; \ } while (0) #define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info) \ do { \ - saved_cred = override_fsids(sdcardfs_sbi, info); \ - if (!saved_cred) \ - return ERR_PTR(-ENOMEM); \ + saved_cred = override_fsids(sdcardfs_sbi, info->data); \ + if (!saved_cred) \ + return ERR_PTR(-ENOMEM); \ } while (0) #define REVERT_CRED(saved_cred) revert_fsids(saved_cred) @@ -142,9 +144,11 @@ typedef enum { struct sdcardfs_sb_info; struct sdcardfs_mount_options; struct sdcardfs_inode_info; +struct sdcardfs_inode_data; /* Do not directly use this function. Use OVERRIDE_CRED() instead. */ -const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, struct sdcardfs_inode_info *info); +const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, + struct sdcardfs_inode_data *data); /* Do not directly use this function, use REVERT_CRED() instead. 
*/ void revert_fsids(const struct cred *old_cred); @@ -178,18 +182,26 @@ struct sdcardfs_file_info { const struct vm_operations_struct *lower_vm_ops; }; -/* sdcardfs inode data in memory */ -struct sdcardfs_inode_info { - struct inode *lower_inode; - /* state derived based on current position in hierachy */ +struct sdcardfs_inode_data { + struct kref refcount; + bool abandoned; + perm_t perm; userid_t userid; uid_t d_uid; bool under_android; bool under_cache; bool under_obb; +}; + +/* sdcardfs inode data in memory */ +struct sdcardfs_inode_info { + struct inode *lower_inode; + /* state derived based on current position in hierarchy */ + struct sdcardfs_inode_data *data; + /* top folder for ownership */ - struct inode *top; + struct sdcardfs_inode_data *top_data; struct inode vfs_inode; }; @@ -351,39 +363,56 @@ SDCARDFS_DENT_FUNC(orig_path) static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo) { - return sbinfo && sbinfo->sb && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; + return sbinfo && sbinfo->sb + && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC; } -/* grab a refererence if we aren't linking to ourself */ -static inline void set_top(struct sdcardfs_inode_info *info, struct inode *top) +static inline struct sdcardfs_inode_data *data_get( + struct sdcardfs_inode_data *data) { - struct inode *old_top = NULL; - - BUG_ON(IS_ERR_OR_NULL(top)); - if (info->top && info->top != &info->vfs_inode) - old_top = info->top; - if (top != &info->vfs_inode) - igrab(top); - info->top = top; - iput(old_top); + if (data) + kref_get(&data->refcount); + return data; } -static inline struct inode *grab_top(struct sdcardfs_inode_info *info) +static inline struct sdcardfs_inode_data *top_data_get( + struct sdcardfs_inode_info *info) { - struct inode *top = info->top; + return data_get(info->top_data); +} + +extern void data_release(struct kref *ref); + +static inline void data_put(struct sdcardfs_inode_data *data) +{ + kref_put(&data->refcount, data_release); +} + +static inline void release_own_data(struct sdcardfs_inode_info *info) +{ + /* + * This happens exactly once per inode. At this point, the inode that + * originally held this data is about to be freed, and all references + * to it are held as a top value, and will likely be released soon. 
+ */ + info->data->abandoned = true; + data_put(info->data); +} + +static inline void set_top(struct sdcardfs_inode_info *info, + struct sdcardfs_inode_data *top) +{ + struct sdcardfs_inode_data *old_top = info->top_data; if (top) - return igrab(top); - else - return NULL; + data_get(top); + info->top_data = top; + if (old_top) + data_put(old_top); } -static inline void release_top(struct sdcardfs_inode_info *info) -{ - iput(info->top); -} - -static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info) +static inline int get_gid(struct vfsmount *mnt, + struct sdcardfs_inode_data *data) { struct sdcardfs_vfsmount_options *opts = mnt->data; @@ -396,10 +425,12 @@ static inline int get_gid(struct vfsmount *mnt, struct sdcardfs_inode_info *info */ return AID_SDCARD_RW; else - return multiuser_get_uid(info->userid, opts->gid); + return multiuser_get_uid(data->userid, opts->gid); } -static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *info) +static inline int get_mode(struct vfsmount *mnt, + struct sdcardfs_inode_info *info, + struct sdcardfs_inode_data *data) { int owner_mode; int filtered_mode; @@ -407,12 +438,12 @@ static inline int get_mode(struct vfsmount *mnt, struct sdcardfs_inode_info *inf int visible_mode = 0775 & ~opts->mask; - if (info->perm == PERM_PRE_ROOT) { + if (data->perm == PERM_PRE_ROOT) { /* Top of multi-user view should always be visible to ensure * secondary users can traverse inside. */ visible_mode = 0711; - } else if (info->under_android) { + } else if (data->under_android) { /* Block "other" access to Android directories, since only apps * belonging to a specific user should be in there; we still * leave +x open for the default view. @@ -481,8 +512,9 @@ struct limit_search { userid_t userid; }; -extern void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid, - uid_t uid, bool under_android, struct inode *top); +extern void setup_derived_state(struct inode *inode, perm_t perm, + userid_t userid, uid_t uid, bool under_android, + struct sdcardfs_inode_data *top); extern void get_derived_permission(struct dentry *parent, struct dentry *dentry); extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name); extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit); @@ -601,7 +633,7 @@ static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct { dest->i_mode = (src->i_mode & S_IFMT) | S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH; /* 0775 */ - dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->d_uid); + dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->data->d_uid); dest->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); dest->i_rdev = src->i_rdev; dest->i_atime = src->i_atime; diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 8a9c9c7adca2..7f4539b4b249 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -26,6 +26,23 @@ */ static struct kmem_cache *sdcardfs_inode_cachep; +/* + * To support the top references, we must track some data separately. + * An sdcardfs_inode_info always has a reference to its data, and once set up, + * also has a reference to its top. The top may be itself, in which case it + * holds two references to its data. When top is changed, it takes a ref to the + * new data and then drops the ref to the old data. 
+ */ +static struct kmem_cache *sdcardfs_inode_data_cachep; + +void data_release(struct kref *ref) +{ + struct sdcardfs_inode_data *data = + container_of(ref, struct sdcardfs_inode_data, refcount); + + kmem_cache_free(sdcardfs_inode_data_cachep, data); +} + /* final actions when unmounting a file system */ static void sdcardfs_put_super(struct super_block *sb) { @@ -166,6 +183,7 @@ static void sdcardfs_evict_inode(struct inode *inode) struct inode *lower_inode; truncate_inode_pages(&inode->i_data, 0); + set_top(SDCARDFS_I(inode), NULL); clear_inode(inode); /* * Decrement a reference to a lower_inode, which was incremented @@ -173,13 +191,13 @@ static void sdcardfs_evict_inode(struct inode *inode) */ lower_inode = sdcardfs_lower_inode(inode); sdcardfs_set_lower_inode(inode, NULL); - set_top(SDCARDFS_I(inode), inode); iput(lower_inode); } static struct inode *sdcardfs_alloc_inode(struct super_block *sb) { struct sdcardfs_inode_info *i; + struct sdcardfs_inode_data *d; i = kmem_cache_alloc(sdcardfs_inode_cachep, GFP_KERNEL); if (!i) @@ -188,6 +206,16 @@ static struct inode *sdcardfs_alloc_inode(struct super_block *sb) /* memset everything up to the inode to 0 */ memset(i, 0, offsetof(struct sdcardfs_inode_info, vfs_inode)); + d = kmem_cache_alloc(sdcardfs_inode_data_cachep, + GFP_KERNEL | __GFP_ZERO); + if (!d) { + kmem_cache_free(sdcardfs_inode_cachep, i); + return NULL; + } + + i->data = d; + kref_init(&d->refcount); + i->vfs_inode.i_version = 1; return &i->vfs_inode; } @@ -196,6 +224,7 @@ static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); + release_own_data(SDCARDFS_I(inode)); kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode)); } @@ -214,20 +243,30 @@ static void init_once(void *obj) int sdcardfs_init_inode_cache(void) { - int err = 0; - sdcardfs_inode_cachep = kmem_cache_create("sdcardfs_inode_cache", sizeof(struct sdcardfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT, init_once); + if (!sdcardfs_inode_cachep) - err = -ENOMEM; - return err; + return -ENOMEM; + + sdcardfs_inode_data_cachep = + kmem_cache_create("sdcardfs_inode_data_cache", + sizeof(struct sdcardfs_inode_data), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!sdcardfs_inode_data_cachep) { + kmem_cache_destroy(sdcardfs_inode_cachep); + return -ENOMEM; + } + + return 0; } /* sdcardfs inode cache destructor */ void sdcardfs_destroy_inode_cache(void) { + kmem_cache_destroy(sdcardfs_inode_data_cachep); kmem_cache_destroy(sdcardfs_inode_cachep); } From e5272d4c20b3270520a7f860cbe4d85da3728507 Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Wed, 10 May 2017 15:12:19 -0400 Subject: [PATCH 0802/3220] ANDROID: AVB: Only invalidate vbmeta when told to do so. When using AVB, the dm-verity error handling mode is now customizable by the bootloader. This CL is for the kernel-side support: it makes the AVB error handler invalidate the vbmeta partition only if androidboot.vbmeta.invalidate_on_error is set to "yes". Bug: 38157502 Test: Manually tested all dm-verity error modes on a UEFI-based bootloader. Change-Id: If36b8a5be9ffd6120e7e0c162843e732eba2ce26 Signed-off-by: David Zeuthen --- drivers/md/dm-verity-avb.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c index 88487346c4c6..a0ee31e0dd09 100644 --- a/drivers/md/dm-verity-avb.c +++ b/drivers/md/dm-verity-avb.c @@ -12,8 +12,9 @@ #define DM_MSG_PREFIX "verity-avb" -/* Set via module parameter. */ +/* Set via module parameters.
 */ static char avb_vbmeta_device[64]; +static char avb_invalidate_on_error[4]; static void invalidate_vbmeta_endio(struct bio *bio) { @@ -175,6 +176,11 @@ void dm_verity_avb_error_handler(void) DMINFO("AVB error handler called for %s", avb_vbmeta_device); + if (strcmp(avb_invalidate_on_error, "yes") != 0) { + DMINFO("Not configured to invalidate"); + return; + } + if (avb_vbmeta_device[0] == '\0') { DMERR("avb_vbmeta_device parameter not set"); goto fail_no_dev; @@ -215,3 +221,5 @@ MODULE_LICENSE("GPL"); #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "androidboot.vbmeta." module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0); +module_param_string(invalidate_on_error, avb_invalidate_on_error, + sizeof(avb_invalidate_on_error), 0); From a533221d29ec146359e20edabc98e1a00098a106 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 22 May 2017 13:23:56 -0700 Subject: [PATCH 0803/3220] ANDROID: sdcardfs: Check for NULL in revalidate If the inode is in the process of being evicted, the top value may be NULL. Signed-off-by: Daniel Rosenberg Bug: 38502532 Change-Id: I0b9d04aab621e0398d44d1c5dc53293106aa5f89 --- fs/sdcardfs/dentry.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c index 83ae9103a0f6..13da7e5245bd 100644 --- a/fs/sdcardfs/dentry.c +++ b/fs/sdcardfs/dentry.c @@ -109,14 +109,16 @@ static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags) goto out; /* If our top's inode is gone, we may be out of date */ - inode = d_inode(dentry); + inode = igrab(d_inode(dentry)); if (inode) { data = top_data_get(SDCARDFS_I(inode)); - if (data->abandoned) { + if (!data || data->abandoned) { d_drop(dentry); err = 0; } - data_put(data); + if (data) + data_put(data); + iput(inode); } out: From 815c6db10b0741ba10e6c7d3e7fcdc0777bc8918 Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Mon, 22 May 2017 17:49:40 -0400 Subject: [PATCH 0804/3220] ANDROID: AVB: Fix invalidate_vbmeta_submit(). On some boards with newer kernels the invalidate_vbmeta_submit() function likely failed because it was using a hard-coded size of 512 bytes. Use PAGE_SIZE and also use bio_add_page(). Also print out if the I/O actually fails. Additionally, print major:minor of the device we're acting on, since another subset of boards gets this wrong by passing the wrong GUID to libavb in the bootloader. Having this information printed out makes it a lot easier to pinpoint such mistakes. Test: Manually tested.
Bug: 38451312 Change-Id: Ib58548953a3375a3c79acf74aa745be9420b8216 Signed-off-by: David Zeuthen --- drivers/md/dm-verity-avb.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c index a0ee31e0dd09..7486dd7fb16d 100644 --- a/drivers/md/dm-verity-avb.c +++ b/drivers/md/dm-verity-avb.c @@ -18,6 +18,8 @@ static char avb_invalidate_on_error[4]; static void invalidate_vbmeta_endio(struct bio *bio) { + if (bio->bi_error) + DMERR("invalidate_vbmeta_endio: error %d", bio->bi_error); complete(bio->bi_private); } @@ -31,20 +33,17 @@ static int invalidate_vbmeta_submit(struct bio *bio, bio->bi_private = &wait; bio->bi_end_io = invalidate_vbmeta_endio; bio->bi_bdev = bdev; + bio->bi_rw = rw; bio->bi_iter.bi_sector = 0; if (access_last_sector) { sector_t last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; bio->bi_iter.bi_sector = last_sector; } - bio->bi_vcnt = 1; - bio->bi_iter.bi_idx = 0; - bio->bi_iter.bi_size = 512; - bio->bi_iter.bi_bvec_done = 0; - bio->bi_rw = rw; - bio->bi_io_vec[0].bv_page = page; - bio->bi_io_vec[0].bv_len = 512; - bio->bi_io_vec[0].bv_offset = 0; + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + DMERR("invalidate_vbmeta_submit: bio_add_page error"); + return -EIO; + } submit_bio(rw, bio); /* Wait up to 2 seconds for completion or fail. */ @@ -66,6 +65,9 @@ static int invalidate_vbmeta(dev_t vbmeta_devt) int rw = REQ_SYNC | REQ_SOFTBARRIER | REQ_NOIDLE; int access_last_sector = 0; + DMINFO("invalidate_vbmeta: acting on device %d:%d", + MAJOR(vbmeta_devt), MINOR(vbmeta_devt)); + /* First we open the device for reading. */ dev_mode = FMODE_READ | FMODE_EXCL; bdev = blkdev_get_by_dev(vbmeta_devt, dev_mode, From f0893c7ddba2580e25ce9386ec7f4569c9a8b5fb Mon Sep 17 00:00:00 2001 From: David Zeuthen Date: Tue, 23 May 2017 17:45:39 -0400 Subject: [PATCH 0805/3220] ANDROID: AVB: Fix linter errors. Various other kernel trees complained about style problems with dm-verity-avb.c, specifically some lines being wider than 80 characters and spaces being used instead of tabs. With these changes checkpatch.pl is happy: $ scripts/checkpatch.pl --file drivers/md/dm-verity-avb.c total: 0 errors, 0 warnings, 229 lines checked drivers/md/dm-verity-avb.c has no obvious style problems and is ready for submission. Bug: None Test: Compiles. Change-Id: I08913adf61c5bf2b0f78f7d9e18dbe93feaba9f7 Signed-off-by: David Zeuthen --- drivers/md/dm-verity-avb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-verity-avb.c b/drivers/md/dm-verity-avb.c index 7486dd7fb16d..727aacbb1480 100644 --- a/drivers/md/dm-verity-avb.c +++ b/drivers/md/dm-verity-avb.c @@ -37,7 +37,9 @@ static int invalidate_vbmeta_submit(struct bio *bio, bio->bi_iter.bi_sector = 0; if (access_last_sector) { - sector_t last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; + sector_t last_sector; + + last_sector = (i_size_read(bdev->bd_inode)>>SECTOR_SHIFT) - 1; bio->bi_iter.bi_sector = last_sector; } if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { @@ -118,7 +120,7 @@ static int invalidate_vbmeta(dev_t vbmeta_devt) goto failed_to_submit_read; } if (memcmp("AVBf", page_address(page) + offset, 4) != 0) { - DMERR("invalidate_vbmeta called on non-vbmeta partition"); + DMERR("invalidate_vbmeta on non-vbmeta partition"); ret = -EINVAL; goto invalid_header; } @@ -224,4 +226,4 @@ MODULE_LICENSE("GPL"); #define MODULE_PARAM_PREFIX "androidboot.vbmeta." 
module_param_string(device, avb_vbmeta_device, sizeof(avb_vbmeta_device), 0); module_param_string(invalidate_on_error, avb_invalidate_on_error, - sizeof(avb_invalidate_on_error), 0); + sizeof(avb_invalidate_on_error), 0); From 200f49cf4a70602a18e34f171eaccba6e561e36a Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 22 May 2017 12:08:06 -0700 Subject: [PATCH 0806/3220] ANDROID: uid_sys_stats: defer io stats calculation for dead tasks Store the sum of dead-task io stats in uid_entry and defer the uid io calculation until the next uid proc stat change or dumpsys. Bug: 37754877 Change-Id: I970f010a4c841c5ca26d0efc7e027414c3c952e0 Signed-off-by: Jin Qian --- drivers/misc/uid_sys_stats.c | 107 ++++++++++++++--------------------- 1 file changed, 42 insertions(+), 65 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index ad21276c8d9e..091370f4ea40 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -49,7 +49,8 @@ struct io_stats { #define UID_STATE_TOTAL_CURR 2 #define UID_STATE_TOTAL_LAST 3 -#define UID_STATE_SIZE 4 +#define UID_STATE_DEAD_TASKS 4 +#define UID_STATE_SIZE 5 struct uid_entry { uid_t uid; @@ -214,35 +215,44 @@ static u64 compute_write_bytes(struct task_struct *task) return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; } -static void add_uid_io_curr_stats(struct uid_entry *uid_entry, - struct task_struct *task) +static void add_uid_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) { - struct io_stats *io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; + struct io_stats *io_slot = &uid_entry->io[slot]; - io_curr->read_bytes += task->ioac.read_bytes; - io_curr->write_bytes += compute_write_bytes(task); - io_curr->rchar += task->ioac.rchar; - io_curr->wchar += task->ioac.wchar; - io_curr->fsync += task->ioac.syscfs; + io_slot->read_bytes += task->ioac.read_bytes; + io_slot->write_bytes += compute_write_bytes(task); + io_slot->rchar += task->ioac.rchar; + io_slot->wchar += task->ioac.wchar; + io_slot->fsync += task->ioac.syscfs; } -static void clean_uid_io_last_stats(struct uid_entry *uid_entry, - struct task_struct *task) +static void compute_uid_io_bucket_stats(struct io_stats *io_bucket, + struct io_stats *io_curr, + struct io_stats *io_last, + struct io_stats *io_dead) { - struct io_stats *io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; + io_bucket->read_bytes += io_curr->read_bytes + io_dead->read_bytes - + io_last->read_bytes; + io_bucket->write_bytes += io_curr->write_bytes + io_dead->write_bytes - + io_last->write_bytes; + io_bucket->rchar += io_curr->rchar + io_dead->rchar - io_last->rchar; + io_bucket->wchar += io_curr->wchar + io_dead->wchar - io_last->wchar; + io_bucket->fsync += io_curr->fsync + io_dead->fsync - io_last->fsync; - io_last->read_bytes -= task->ioac.read_bytes; - io_last->write_bytes -= compute_write_bytes(task); - io_last->rchar -= task->ioac.rchar; - io_last->wchar -= task->ioac.wchar; - io_last->fsync -= task->ioac.syscfs; + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; + + memset(io_dead, 0, sizeof(struct io_stats)); } static void update_io_stats_all_locked(void) { struct uid_entry *uid_entry; struct task_struct *task, *temp; - struct io_stats *io_bucket, *io_curr, *io_last; struct user_namespace *user_ns = current_user_ns(); unsigned long bkt; uid_t uid; @@ -257,70 +267,38 @@ static void update_io_stats_all_locked(void)
uid_entry = find_or_register_uid(uid); if (!uid_entry) continue; - add_uid_io_curr_stats(uid_entry, task); + add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); } while_each_thread(temp, task); rcu_read_unlock(); hash_for_each(hash_table, bkt, uid_entry, hash) { - io_bucket = &uid_entry->io[uid_entry->state]; - io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; - io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; - - io_bucket->read_bytes += - io_curr->read_bytes - io_last->read_bytes; - io_bucket->write_bytes += - io_curr->write_bytes - io_last->write_bytes; - io_bucket->rchar += io_curr->rchar - io_last->rchar; - io_bucket->wchar += io_curr->wchar - io_last->wchar; - io_bucket->fsync += io_curr->fsync - io_last->fsync; - - io_last->read_bytes = io_curr->read_bytes; - io_last->write_bytes = io_curr->write_bytes; - io_last->rchar = io_curr->rchar; - io_last->wchar = io_curr->wchar; - io_last->fsync = io_curr->fsync; + compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + &uid_entry->io[UID_STATE_TOTAL_CURR], + &uid_entry->io[UID_STATE_TOTAL_LAST], + &uid_entry->io[UID_STATE_DEAD_TASKS]); } } -static void update_io_stats_uid_locked(uid_t target_uid) +static void update_io_stats_uid_locked(struct uid_entry *uid_entry) { - struct uid_entry *uid_entry; struct task_struct *task, *temp; - struct io_stats *io_bucket, *io_curr, *io_last; struct user_namespace *user_ns = current_user_ns(); - uid_entry = find_or_register_uid(target_uid); - if (!uid_entry) - return; - memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); rcu_read_lock(); do_each_thread(temp, task) { - if (from_kuid_munged(user_ns, task_uid(task)) != target_uid) + if (from_kuid_munged(user_ns, task_uid(task)) != uid_entry->uid) continue; - add_uid_io_curr_stats(uid_entry, task); + add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); } while_each_thread(temp, task); rcu_read_unlock(); - io_bucket = &uid_entry->io[uid_entry->state]; - io_curr = &uid_entry->io[UID_STATE_TOTAL_CURR]; - io_last = &uid_entry->io[UID_STATE_TOTAL_LAST]; - - io_bucket->read_bytes += - io_curr->read_bytes - io_last->read_bytes; - io_bucket->write_bytes += - io_curr->write_bytes - io_last->write_bytes; - io_bucket->rchar += io_curr->rchar - io_last->rchar; - io_bucket->wchar += io_curr->wchar - io_last->wchar; - io_bucket->fsync += io_curr->fsync - io_last->fsync; - - io_last->read_bytes = io_curr->read_bytes; - io_last->write_bytes = io_curr->write_bytes; - io_last->rchar = io_curr->rchar; - io_last->wchar = io_curr->wchar; - io_last->fsync = io_curr->fsync; + compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + &uid_entry->io[UID_STATE_TOTAL_CURR], + &uid_entry->io[UID_STATE_TOTAL_LAST], + &uid_entry->io[UID_STATE_DEAD_TASKS]); } static int uid_io_show(struct seq_file *m, void *v) @@ -405,7 +383,7 @@ static ssize_t uid_procstat_write(struct file *file, return count; } - update_io_stats_uid_locked(uid); + update_io_stats_uid_locked(uid_entry); uid_entry->state = state; @@ -443,8 +421,7 @@ static int process_notifier(struct notifier_block *self, uid_entry->utime += utime; uid_entry->stime += stime; - update_io_stats_uid_locked(uid); - clean_uid_io_last_stats(uid_entry, task); + add_uid_io_stats(uid_entry, task, UID_STATE_DEAD_TASKS); exit: rt_mutex_unlock(&uid_lock); From d73d07673edbdbe78e1a7d00e7827ba9bfd86a59 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 29 May 2017 16:38:16 -0700 Subject: [PATCH 0807/3220] ANDROID: mnt: Fix next_descendent next_descendent did not properly handle the case where the 
initial mount had no slaves. In this case, we would look for the next slave, but since we don't have a master, the check for wrapping around to the start of the list will always fail. Instead, we check for this case, and ensure that we end the iteration when we come back to the root. Signed-off-by: Daniel Rosenberg Bug: 62094374 Change-Id: I43dfcee041aa3730cb4b9a1161418974ef84812e --- fs/pnode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/pnode.c b/fs/pnode.c index b5f97c605d98..e4e428d621e9 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -504,9 +504,14 @@ static struct mount *next_descendent(struct mount *root, struct mount *cur) if (!IS_MNT_NEW(cur) && !list_empty(&cur->mnt_slave_list)) return first_slave(cur); do { - if (cur->mnt_slave.next != &cur->mnt_master->mnt_slave_list) - return next_slave(cur); - cur = cur->mnt_master; + struct mount *master = cur->mnt_master; + + if (!master || cur->mnt_slave.next != &master->mnt_slave_list) { + struct mount *next = next_slave(cur); + + return (next == root) ? NULL : next; + } + cur = master; } while (cur != root); return NULL; } From 7d16e880c62547936b431cde966d17e39e6e92e0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 8 May 2017 10:21:34 -0700 Subject: [PATCH 0808/3220] Revert "ext4: require encryption feature for EXT4_IOC_SET_ENCRYPTION_POLICY" This reverts commit e2968fb8e7980dccc199dac2593ad476db20969f. For various reasons, we've had to start enforcing upstream that ext4 encryption can only be used if the filesystem superblock has the EXT4_FEATURE_INCOMPAT_ENCRYPT flag set, as was the intended design. Unfortunately, Android isn't ready for this quite yet, since its userspace still needs to be updated to set the flag at mkfs time, or else fix it later with tune2fs. It will need some more time to be fixed properly, so for now to avoid breaking some devices, revert the kernel change. Bug: 36231741 Signed-off-by: Eric Biggers Change-Id: I30bd54afb68dbaf9801f8954099dffa90a2f8df1 --- fs/ext4/ioctl.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c21826be1cb3..3a2594665b44 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -626,9 +626,6 @@ resizefs_out: struct ext4_encryption_policy policy; int err = 0; - if (!ext4_has_feature_encrypt(sb)) - return -EOPNOTSUPP; - if (copy_from_user(&policy, (struct ext4_encryption_policy __user *)arg, sizeof(policy))) { From df6e5af086d3834986af1f6e63f572edde9bc5ee Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 30 May 2017 14:46:26 -0700 Subject: [PATCH 0809/3220] ANDROID: hid: uhid: implement refcount for open and close Fix concurrent open and close activity sending a UHID_CLOSE while some consumers still have the device open. This is a temporary solution implementing reference counts for device open and close calls, pending a facility for this in the HID core, which is likely to appear in the future.
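The mechanics are the classic refcounted open/close: only the 0 -> 1 open forwards UHID_OPEN and only the 1 -> 0 close forwards UHID_CLOSE, both under one mutex. A generic sketch of the pattern (simplified; notify_open()/notify_close() are hypothetical stand-ins for the uhid event queueing):

	static DEFINE_MUTEX(open_mutex);
	static int open_count;

	static int dev_open(void)
	{
		int ret = 0;

		mutex_lock(&open_mutex);
		if (!open_count++) {
			ret = notify_open();	/* hypothetical backend call */
			if (ret)
				open_count--;	/* roll back on failure */
		}
		mutex_unlock(&open_mutex);
		return ret;
	}

	static void dev_close(void)
	{
		mutex_lock(&open_mutex);
		if (!--open_count)
			notify_close();		/* hypothetical backend call */
		mutex_unlock(&open_mutex);
	}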
[toddpoynor@google.com: commit text] Bug: 38448648 Signed-off-by: Todd Poynor Signed-off-by: Dmitry Torokhov Change-Id: I57413e42ec961a960a8ddc4942228df22c730d80 --- drivers/hid/uhid.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 1a2032c2c1fb..690a9f0fa042 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -28,6 +28,8 @@ #define UHID_NAME "uhid" #define UHID_BUFSIZE 32 +static DEFINE_MUTEX(uhid_open_mutex); + struct uhid_device { struct mutex devlock; bool running; @@ -142,15 +144,26 @@ static void uhid_hid_stop(struct hid_device *hid) static int uhid_hid_open(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; + int retval = 0; - return uhid_queue_event(uhid, UHID_OPEN); + mutex_lock(&uhid_open_mutex); + if (!hid->open++) { + retval = uhid_queue_event(uhid, UHID_OPEN); + if (retval) + hid->open--; + } + mutex_unlock(&uhid_open_mutex); + return retval; } static void uhid_hid_close(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; - uhid_queue_event(uhid, UHID_CLOSE); + mutex_lock(&uhid_open_mutex); + if (!--hid->open) + uhid_queue_event(uhid, UHID_CLOSE); + mutex_unlock(&uhid_open_mutex); } static int uhid_hid_parse(struct hid_device *hid) From 5df19f969e4834548c4cf930c591581e237fccd0 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Wed, 24 May 2017 10:28:27 +0800 Subject: [PATCH 0810/3220] ANDROID: Kconfig: add depends for UID_SYS_STATS uid_io depends on TASK_XACCT and TASK_IO_ACCOUNTING. Add these dependencies to the Kconfig entry so the code is only built when they are enabled. Change-Id: Ie6bf57ec7c2eceffadf4da0fc2aca001ce10c36e Signed-off-by: Ganesh Mahendran --- drivers/misc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 5847d3be0835..9e649992c177 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -527,7 +527,7 @@ config VEXPRESS_SYSCFG config UID_SYS_STATS bool "Per-UID statistics" - depends on PROFILING + depends on PROFILING && TASK_XACCT && TASK_IO_ACCOUNTING help Per UID based cpu time statistics exported to /proc/uid_cputime Per UID based io statistics exported to /proc/uid_io From f02702dcf231c3258aabd702023286f6c01aaa21 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 11 Nov 2016 14:04:43 -0800 Subject: [PATCH 0811/3220] sched: backport cpufreq hooks from 4.9-rc4 The scheduler cpufreq hooks are required by the schedutil cpufreq governor.
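A governor consumes these hooks by registering one callback per CPU; a minimal sketch of attach/detach using the API this patch adds (the callback body is a placeholder):

	static void my_update_util(struct update_util_data *data, u64 time,
				   unsigned int flags)
	{
		/* Runs in RCU-sched read-side context: must not sleep. */
		if (flags & SCHED_CPUFREQ_RT_DL)
			return;	/* e.g. request the maximum frequency */
	}

	static DEFINE_PER_CPU(struct update_util_data, my_util_data);

	static void my_gov_start(int cpu)
	{
		cpufreq_add_update_util_hook(cpu, &per_cpu(my_util_data, cpu),
					     my_update_util);
	}

	static void my_gov_stop(int cpu)
	{
		cpufreq_remove_update_util_hook(cpu);
		/* per the API comment: avoid use-after-free of my_util_data */
		synchronize_sched();
	}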
Change-Id: Ied6c46262bb33b7e81bbb3d3d2761124e0c676b7 Signed-off-by: Steve Muckle [trivial cherry-picking fixes] Signed-off-by: Juri Lelli Signed-off-by: Chris Redpath --- include/linux/sched.h | 17 +++++++++++ kernel/sched/Makefile | 1 + kernel/sched/cpufreq.c | 63 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/deadline.c | 3 ++ kernel/sched/fair.c | 59 +++++++++++++++++++++++++++++++++----- kernel/sched/rt.c | 3 ++ kernel/sched/sched.h | 52 ++++++++++++++++++++++++++++++++++ 7 files changed, 191 insertions(+), 7 deletions(-) create mode 100644 kernel/sched/cpufreq.c diff --git a/include/linux/sched.h b/include/linux/sched.h index 79f70e5bae08..bc23d15244db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3291,4 +3291,21 @@ static inline unsigned long rlimit_max(unsigned int limit) return task_rlimit_max(current, limit); } +#define SCHED_CPUFREQ_RT (1U << 0) +#define SCHED_CPUFREQ_DL (1U << 1) +#define SCHED_CPUFREQ_IOWAIT (1U << 2) + +#define SCHED_CPUFREQ_RT_DL (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL) + +#ifdef CONFIG_CPU_FREQ +struct update_util_data { + void (*func)(struct update_util_data *data, u64 time, unsigned int flags); +}; + +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned int flags)); +void cpufreq_remove_update_util_hook(int cpu); +#endif /* CONFIG_CPU_FREQ */ + #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 623ce4bde0d5..f2d49474aab1 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -21,4 +21,5 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..dbc51442ecbc --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,63 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * @func: Callback function to set for the CPU. + * + * Set and publish the update_util_data pointer for the given CPU. + * + * The update_util_data pointer of @cpu is set to @data and the callback + * function pointer in the target struct update_util_data is set to @func. + * That function will be called by cpufreq_update_util() from RCU-sched + * read-side critical sections, so it must not sleep. @data will always be + * passed to it as the first argument which allows the function to get to the + * target update_util_data structure and its container. + * + * The update_util_data pointer of @cpu must be NULL when this function is + * called or it will WARN() and return with no effect. 
+ */ +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned int flags)) +{ + if (WARN_ON(!data || !func)) + return; + + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu))) + return; + + data->func = func; + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); + +/** + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer. + * @cpu: The CPU to clear the pointer for. + * + * Clear the update_util_data pointer for the given CPU. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_remove_update_util_hook(int cpu) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); +} +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f10b1cb255b2..9ec7f19b5020 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -753,6 +753,9 @@ static void update_curr_dl(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6f353de3f390..baba2d34c34e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2711,6 +2711,29 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). 
+ */ + cpufreq_update_util(rq_of(cfs_rq), 0); + } +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* @@ -2731,10 +2754,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); } while (0) /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, + bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); @@ -2747,6 +2771,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + removed_util = 1; } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2761,6 +2786,9 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq == &rq_of(cfs_rq)->cfs) trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + return decayed || removed; } @@ -2779,7 +2807,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) update_tg_load_avg(cfs_rq, 0); if (entity_is_task(se)) @@ -2811,6 +2839,8 @@ skip_aging: cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2823,6 +2853,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2840,7 +2872,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; @@ -2940,7 +2972,11 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void update_load_avg(struct sched_entity *se, int update_tg) +{ + cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); +} + static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -4257,6 +4293,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_wakeup = flags & ENQUEUE_WAKEUP; #endif + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. 
+ */ + if (p->in_iowait) + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + for_each_sched_entity(se) { if (se->on_rq) break; @@ -6923,7 +6967,8 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, + true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6984,7 +7029,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 541b8494450e..6bb51d62dca4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1002,6 +1002,9 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0386a2cdb008..237b0bcdd304 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2038,3 +2038,55 @@ static inline void account_reset_rq(struct rq *rq) rq->prev_steal_time_rq = 0; #endif } + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. + * + * This function is called by the scheduler on the CPU whose utilization is + * being updated. + * + * It can only be called from RCU-sched read-side critical sections. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. 
+ */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, rq_clock(rq), flags); +} + +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) +{ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_update_util(rq, flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif From 6bc6115c16caeb19d2d946eef8ca00d00c154118 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 11 Nov 2016 17:14:39 -0800 Subject: [PATCH 0812/3220] BACKPORT: cpufreq: schedutil: New governor based on scheduler utilization data Add a new cpufreq scaling governor, called "schedutil", that uses scheduler-provided CPU utilization information as input for making its decisions. Doing that is possible after commit 34e2c55 (cpufreq: Add mechanism for registering utilization update callbacks) that introduced cpufreq_update_util() called by the scheduler on utilization changes (from CFS) and RT/DL task status updates. In particular, CPU frequency scaling decisions may be based on the utilization data passed to cpufreq_update_util() by CFS. The new governor is relatively simple. The frequency selection formula used by it depends on whether or not the utilization is frequency-invariant. In the frequency-invariant case the new CPU frequency is given by next_freq = 1.25 * max_freq * util / max where util and max are the last two arguments of cpufreq_update_util(). In turn, if util is not frequency-invariant, the maximum frequency in the above formula is replaced with the current frequency of the CPU: next_freq = 1.25 * curr_freq * util / max The coefficient 1.25 corresponds to the frequency tipping point at (util / max) = 0.8. All of the computations are carried out in the utilization update handlers provided by the new governor. One of those handlers is used for cpufreq policies shared between multiple CPUs and the other one is for policies with one CPU only (and therefore it doesn't need to use any extra synchronization means). The governor supports fast frequency switching if that is supported by the cpufreq driver in use and possible for the given policy. In the fast switching case, all operations of the governor take place in its utilization update handlers. If fast switching cannot be used, the frequency switch operations are carried out with the help of a work item which only calls __cpufreq_driver_target() (under a mutex) to trigger a frequency update (to a value already computed beforehand in one of the utilization update handlers). Currently, the governor treats all of the RT and DL tasks as "unknown utilization" and sets the frequency to the allowed maximum when updated from the RT or DL sched classes. That heavy-handed approach should be replaced with something more subtle and specifically targeted at RT and DL tasks. The governor shares some tunables management code with the "ondemand" and "conservative" governors and uses some common definitions from cpufreq_governor.h, but apart from that it is stand-alone.
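For illustration only: the selection rule above is simple integer arithmetic. A minimal sketch of it in plain C (the helper name is illustrative, not the governor's) matches the freq + (freq >> 2) form used by get_next_freq() in the diff below:

        /*
         * Illustrative sketch: computes 1.25 * ref_freq * util / max
         * without floating point. ref_freq is max_freq when the
         * utilization is frequency-invariant, else the current frequency.
         */
        static unsigned int next_freq_sketch(unsigned int ref_freq,
                                             unsigned long util,
                                             unsigned long max)
        {
                return (ref_freq + (ref_freq >> 2)) * util / max;
        }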
Change-Id: I03876e622768e4b3ee4dc28682af7cce771f2f4c Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) (cherry-picked from 9bdcb44e391da5c41b98573bf0305a0e0b1c9569) [ Backport the schedutil cpufreq governor from 4.9. Some cpufreq tunable infrastructure as well as the resolve_freq API is also backported as those are dependencies] Signed-off-by: Steve Muckle [trivial cherry-picking fixes] Signed-off-by: Juri Lelli [fixed default governor machinery] Signed-off-by: Chris Redpath --- drivers/cpufreq/Kconfig | 26 + drivers/cpufreq/Makefile | 2 +- drivers/cpufreq/cpufreq.c | 32 ++ drivers/cpufreq/cpufreq_governor_attr_set.c | 84 +++ include/linux/cpufreq.h | 52 ++ kernel/sched/Makefile | 1 + kernel/sched/cpufreq_schedutil.c | 601 ++++++++++++++++++++ 7 files changed, 797 insertions(+), 1 deletion(-) create mode 100644 drivers/cpufreq/cpufreq_governor_attr_set.c create mode 100644 kernel/sched/cpufreq_schedutil.c diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index d43c401ff190..91b05e2b8799 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -120,6 +120,15 @@ config CPU_FREQ_DEFAULT_GOV_SCHED cpu frequency using CPU utilization estimates from the scheduler. +config CPU_FREQ_DEFAULT_GOV_SCHEDUTIL + bool "schedutil" + depends on SMP + select CPU_FREQ_GOV_SCHEDUTIL + select CPU_FREQ_GOV_PERFORMANCE + help + Use the 'schedutil' CPUFreq governor by default. If unsure, + have a look at the help section of that governor. The fallback + governor will be 'performance'. endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -228,6 +237,23 @@ config CPU_FREQ_GOV_SCHED If in doubt, say N. +config CPU_FREQ_GOV_SCHEDUTIL + bool "'schedutil' cpufreq policy governor" + depends on CPU_FREQ && SMP + select CPU_FREQ_GOV_ATTR_SET + select IRQ_WORK + help + This governor makes decisions based on the utilization data provided + by the scheduler. It sets the CPU frequency to be proportional to + the utilization/capacity ratio coming from the scheduler. If the + utilization is frequency-invariant, the new frequency is also + proportional to the maximum available frequency. If that is not the + case, it is proportional to the current frequency of the CPU. The + frequency tipping point is at utilization/capacity equal to 80% in + both cases. + + If in doubt, say N. + comment "CPU frequency scaling drivers" config CPUFREQ_DT diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index b02ae1463a15..04e6324fd638 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -1,5 +1,5 @@ # CPUfreq core -obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o cpufreq_governor_attr_set.o # CPUfreq stats obj-$(CONFIG_CPU_FREQ_STAT) += cpufreq_stats.o diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0ceb0a889ebf..56d12ed4f38c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -531,6 +531,38 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_end); +/** + * cpufreq_driver_resolve_freq - Map a target frequency to a driver-supported + * one. + * @target_freq: target frequency to resolve. + * + * The target to driver frequency mapping is cached in the policy. + * + * Return: Lowest driver-supported frequency greater than or equal to the + * given target_freq, subject to policy (min/max) and driver limitations. 
+ */ +unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + target_freq = clamp_val(target_freq, policy->min, policy->max); + policy->cached_target_freq = target_freq; + + if (cpufreq_driver->target_index) { + int idx, rv; + + rv = cpufreq_frequency_table_target(policy, policy->freq_table, + target_freq, + CPUFREQ_RELATION_L, + &idx); + if (rv) + return target_freq; + policy->cached_resolved_idx = idx; + return policy->freq_table[idx].frequency; + } + + return target_freq; +} +EXPORT_SYMBOL_GPL(cpufreq_driver_resolve_freq); /********************************************************************* * SYSFS INTERFACE * diff --git a/drivers/cpufreq/cpufreq_governor_attr_set.c b/drivers/cpufreq/cpufreq_governor_attr_set.c new file mode 100644 index 000000000000..52841f807a7e --- /dev/null +++ b/drivers/cpufreq/cpufreq_governor_attr_set.c @@ -0,0 +1,84 @@ +/* + * Abstract code for CPUFreq governor tunable sysfs attributes. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "cpufreq_governor.h" + +static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj) +{ + return container_of(kobj, struct gov_attr_set, kobj); +} + +static inline struct governor_attr *to_gov_attr(struct attribute *attr) +{ + return container_of(attr, struct governor_attr, attr); +} + +static ssize_t governor_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct governor_attr *gattr = to_gov_attr(attr); + + return gattr->show(to_gov_attr_set(kobj), buf); +} + +static ssize_t governor_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); + struct governor_attr *gattr = to_gov_attr(attr); + int ret; + + mutex_lock(&attr_set->update_lock); + ret = attr_set->usage_count ? 
gattr->store(attr_set, buf, count) : -EBUSY; + mutex_unlock(&attr_set->update_lock); + return ret; +} + +const struct sysfs_ops governor_sysfs_ops = { + .show = governor_show, + .store = governor_store, +}; +EXPORT_SYMBOL_GPL(governor_sysfs_ops); + +void gov_attr_set_init(struct gov_attr_set *attr_set, struct list_head *list_node) +{ + INIT_LIST_HEAD(&attr_set->policy_list); + mutex_init(&attr_set->update_lock); + attr_set->usage_count = 1; + list_add(list_node, &attr_set->policy_list); +} +EXPORT_SYMBOL_GPL(gov_attr_set_init); + +void gov_attr_set_get(struct gov_attr_set *attr_set, struct list_head *list_node) +{ + mutex_lock(&attr_set->update_lock); + attr_set->usage_count++; + list_add(list_node, &attr_set->policy_list); + mutex_unlock(&attr_set->update_lock); +} +EXPORT_SYMBOL_GPL(gov_attr_set_get); + +unsigned int gov_attr_set_put(struct gov_attr_set *attr_set, struct list_head *list_node) +{ + unsigned int count; + + mutex_lock(&attr_set->update_lock); + list_del(list_node); + count = --attr_set->usage_count; + mutex_unlock(&attr_set->update_lock); + if (count) + return count; + + kobject_put(&attr_set->kobj); + mutex_destroy(&attr_set->update_lock); + return 0; +} +EXPORT_SYMBOL_GPL(gov_attr_set_put); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 60571292a802..b144e7445477 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -107,6 +107,22 @@ struct cpufreq_policy { */ struct rw_semaphore rwsem; + + /* + * Fast switch flags: + * - fast_switch_possible should be set by the driver if it can + * guarantee that frequency can be changed on any CPU sharing the + * policy and that the change will affect all of the policy CPUs then. + * - fast_switch_enabled is to be set by governors that support fast + * frequency switching with the help of cpufreq_enable_fast_switch(). + */ + bool fast_switch_possible; + bool fast_switch_enabled; + + /* Cached frequency lookup from cpufreq_driver_resolve_freq.
*/ + unsigned int cached_target_freq; + int cached_resolved_idx; + /* Synchronization for frequency transitions */ bool transition_ongoing; /* Tracks transition status */ spinlock_t transition_lock; @@ -471,6 +487,8 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); +unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, + unsigned int target_freq); int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); @@ -502,8 +520,42 @@ extern struct cpufreq_governor cpufreq_gov_interactive; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED) extern struct cpufreq_governor cpufreq_gov_sched; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL) +extern struct cpufreq_governor cpufreq_gov_schedutil; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_schedutil) #endif +static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy) +{ + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); +} + +/* Governor attribute set */ +struct gov_attr_set { + struct kobject kobj; + struct list_head policy_list; + struct mutex update_lock; + int usage_count; +}; + +/* sysfs ops for cpufreq governors */ +extern const struct sysfs_ops governor_sysfs_ops; + +void gov_attr_set_init(struct gov_attr_set *attr_set, struct list_head *list_node); +void gov_attr_set_get(struct gov_attr_set *attr_set, struct list_head *list_node); +unsigned int gov_attr_set_put(struct gov_attr_set *attr_set, struct list_head *list_node); + +/* Governor sysfs attribute */ +struct governor_attr { + struct attribute attr; + ssize_t (*show)(struct gov_attr_set *attr_set, char *buf); + ssize_t (*store)(struct gov_attr_set *attr_set, const char *buf, + size_t count); +}; + /********************************************************************* * FREQUENCY TABLE HELPERS * *********************************************************************/ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f2d49474aab1..ca0d94096170 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c new file mode 100644 index 000000000000..7e42a1d6d88d --- /dev/null +++ b/kernel/sched/cpufreq_schedutil.c @@ -0,0 +1,601 @@ +/* + * CPUFreq governor based on scheduler-provided CPU utilization data. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpufreq.h> +#include <linux/slab.h> +#include <trace/events/power.h> + +#include "sched.h" + +/* Stub out fast switch routines present on mainline to reduce the backport + * overhead.
*/ +#define cpufreq_driver_fast_switch(x, y) 0 +#define cpufreq_enable_fast_switch(x) +#define cpufreq_disable_fast_switch(x) + +struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; +}; + +struct sugov_policy { + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 freq_update_delay_ns; + unsigned int next_freq; + + /* The next fields are only needed if fast switch cannot be used. */ + struct irq_work irq_work; + struct work_struct work; + struct mutex work_lock; + bool work_in_progress; + + bool need_freq_update; +}; + +struct sugov_cpu { + struct update_util_data update_util; + struct sugov_policy *sg_policy; + + unsigned int cached_raw_freq; + unsigned long iowait_boost; + unsigned long iowait_boost_max; + u64 last_update; + + /* The fields below are only needed when sharing a policy. */ + unsigned long util; + unsigned long max; + unsigned int flags; +}; + +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); + +/************************ Governor internals ***********************/ + +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) +{ + s64 delta_ns; + + if (sg_policy->work_in_progress) + return false; + + if (unlikely(sg_policy->need_freq_update)) { + sg_policy->need_freq_update = false; + /* + * This happens when limits change, so forget the previous + * next_freq value and force an update. + */ + sg_policy->next_freq = UINT_MAX; + return true; + } + + delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->freq_update_delay_ns; +} + +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + sg_policy->last_freq_update_time = time; + + if (policy->fast_switch_enabled) { + if (sg_policy->next_freq == next_freq) { + trace_cpu_frequency(policy->cur, smp_processor_id()); + return; + } + sg_policy->next_freq = next_freq; + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (next_freq == CPUFREQ_ENTRY_INVALID) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); + } else if (sg_policy->next_freq != next_freq) { + sg_policy->next_freq = next_freq; + sg_policy->work_in_progress = true; + irq_work_queue(&sg_policy->irq_work); + } +} + +/** + * get_next_freq - Compute a new frequency for a given cpufreq policy. + * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @util: Current CPU utilization. + * @max: CPU capacity. + * + * If the utilization is frequency-invariant, choose the new frequency to be + * proportional to it, that is + * + * next_freq = C * max_freq * util / max + * + * Otherwise, approximate the would-be frequency-invariant utilization by + * util_raw * (curr_freq / max_freq) which leads to + * + * next_freq = C * curr_freq * util_raw / max + * + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations. + */ +static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, + unsigned long max) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int freq = arch_scale_freq_invariant() ? 
+ policy->cpuinfo.max_freq : policy->cur; + + freq = (freq + (freq >> 2)) * util / max; + + if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + return sg_policy->next_freq; + sg_cpu->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); +} + +static void sugov_get_util(unsigned long *util, unsigned long *max) +{ + struct rq *rq = this_rq(); + unsigned long cfs_max; + + cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + + *util = min(rq->cfs.avg.util_avg, cfs_max); + *max = cfs_max; +} + +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + if (flags & SCHED_CPUFREQ_IOWAIT) { + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else if (sg_cpu->iowait_boost) { + s64 delta_ns = time - sg_cpu->last_update; + + /* Clear iowait_boost if the CPU appears to have been idle. */ + if (delta_ns > TICK_NSEC) + sg_cpu->iowait_boost = 0; + } +} + +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, + unsigned long *max) +{ + unsigned long boost_util = sg_cpu->iowait_boost; + unsigned long boost_max = sg_cpu->iowait_boost_max; + + if (!boost_util) + return; + + if (*util * boost_max < *max * boost_util) { + *util = boost_util; + *max = boost_max; + } + sg_cpu->iowait_boost >>= 1; +} + +static void sugov_update_single(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned long util, max; + unsigned int next_f; + + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + + if (!sugov_should_update_freq(sg_policy, time)) + return; + + if (flags & SCHED_CPUFREQ_RT_DL) { + next_f = policy->cpuinfo.max_freq; + } else { + sugov_get_util(&util, &max); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_cpu, util, max); + } + sugov_update_commit(sg_policy, time, next_f); +} + +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, + unsigned long util, unsigned long max, + unsigned int flags) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int max_f = policy->cpuinfo.max_freq; + u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned int j; + + if (flags & SCHED_CPUFREQ_RT_DL) + return max_f; + + sugov_iowait_boost(sg_cpu, &util, &max); + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu; + unsigned long j_util, j_max; + s64 delta_ns; + + if (j == smp_processor_id()) + continue; + + j_sg_cpu = &per_cpu(sugov_cpu, j); + /* + * If the CPU utilization was last updated before the previous + * frequency update and the time elapsed between the last update + * of the CPU utilization and the last frequency update is long + * enough, don't take the CPU into account as it probably is + * idle now (and clear iowait_boost for it).
+ */ + delta_ns = last_freq_update_time - j_sg_cpu->last_update; + if (delta_ns > TICK_NSEC) { + j_sg_cpu->iowait_boost = 0; + continue; + } + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + return max_f; + + j_util = j_sg_cpu->util; + j_max = j_sg_cpu->max; + if (j_util * max > j_max * util) { + util = j_util; + max = j_max; + } + + sugov_iowait_boost(j_sg_cpu, &util, &max); + } + + return get_next_freq(sg_cpu, util, max); +} + +static void sugov_update_shared(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned long util, max; + unsigned int next_f; + + sugov_get_util(&util, &max); + + raw_spin_lock(&sg_policy->update_lock); + + sg_cpu->util = util; + sg_cpu->max = max; + sg_cpu->flags = flags; + + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + + if (sugov_should_update_freq(sg_policy, time)) { + next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + sugov_update_commit(sg_policy, time, next_f); + } + + raw_spin_unlock(&sg_policy->update_lock); +} + +static void sugov_work(struct work_struct *work) +{ + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + + mutex_lock(&sg_policy->work_lock); + __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, + CPUFREQ_RELATION_L); + mutex_unlock(&sg_policy->work_lock); + + sg_policy->work_in_progress = false; +} + +static void sugov_irq_work(struct irq_work *irq_work) +{ + struct sugov_policy *sg_policy; + + sg_policy = container_of(irq_work, struct sugov_policy, irq_work); + schedule_work_on(smp_processor_id(), &sg_policy->work); +} + +/************************** sysfs interface ************************/ + +static struct sugov_tunables *global_tunables; +static DEFINE_MUTEX(global_tunables_lock); + +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set) +{ + return container_of(attr_set, struct sugov_tunables, attr_set); +} + +static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->rate_limit_us); +} + +static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, + size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) + sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; + + return count; +} + +static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); + +static struct attribute *sugov_attributes[] = { + &rate_limit_us.attr, + NULL +}; + +static struct kobj_type sugov_tunables_ktype = { + .default_attrs = sugov_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + +/********************** cpufreq governor interface *********************/ +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +static +#endif +struct cpufreq_governor cpufreq_gov_schedutil; + +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); + if (!sg_policy) + return NULL; + + sg_policy->policy = policy; + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + 
INIT_WORK(&sg_policy->work, sugov_work); + mutex_init(&sg_policy->work_lock); + raw_spin_lock_init(&sg_policy->update_lock); + return sg_policy; +} + +static void sugov_policy_free(struct sugov_policy *sg_policy) +{ + mutex_destroy(&sg_policy->work_lock); + kfree(sg_policy); +} + +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) +{ + struct sugov_tunables *tunables; + + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (tunables) { + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook); + if (!have_governor_per_policy()) + global_tunables = tunables; + } + return tunables; +} + +static void sugov_tunables_free(struct sugov_tunables *tunables) +{ + if (!have_governor_per_policy()) + global_tunables = NULL; + + kfree(tunables); +} + +static int sugov_init(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + struct sugov_tunables *tunables; + unsigned int lat; + int ret = 0; + + /* State should be equivalent to EXIT */ + if (policy->governor_data) + return -EBUSY; + + sg_policy = sugov_policy_alloc(policy); + if (!sg_policy) + return -ENOMEM; + + mutex_lock(&global_tunables_lock); + + if (global_tunables) { + if (WARN_ON(have_governor_per_policy())) { + ret = -EINVAL; + goto free_sg_policy; + } + policy->governor_data = sg_policy; + sg_policy->tunables = global_tunables; + + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook); + goto out; + } + + tunables = sugov_tunables_alloc(sg_policy); + if (!tunables) { + ret = -ENOMEM; + goto free_sg_policy; + } + + tunables->rate_limit_us = 20 * USEC_PER_MSEC; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, + get_governor_parent_kobj(policy), "%s", + cpufreq_gov_schedutil.name); + if (ret) + goto fail; + + out: + mutex_unlock(&global_tunables_lock); + + cpufreq_enable_fast_switch(policy); + return 0; + + fail: + policy->governor_data = NULL; + sugov_tunables_free(tunables); + + free_sg_policy: + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + pr_err("initialization failed (error %d)\n", ret); + return ret; +} + +static int sugov_exit(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + unsigned int count; + + cpufreq_disable_fast_switch(policy); + + mutex_lock(&global_tunables_lock); + + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); + policy->governor_data = NULL; + if (!count) + sugov_tunables_free(tunables); + + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + + return 0; +} + +static int sugov_start(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + + sg_cpu->sg_policy = sg_policy; + if (policy_is_shared(policy)) { + sg_cpu->util = 0; + sg_cpu->max = 0; + sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->last_update = 0; + sg_cpu->cached_raw_freq = 0; + sg_cpu->iowait_boost = 0; + 
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_shared); + } else { + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_single); + } + } + return 0; +} + +static int sugov_stop(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + for_each_cpu(cpu, policy->cpus) + cpufreq_remove_update_util_hook(cpu); + + synchronize_sched(); + + irq_work_sync(&sg_policy->irq_work); + cancel_work_sync(&sg_policy->work); + + return 0; +} + +static int sugov_limits(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + + if (!policy->fast_switch_enabled) { + mutex_lock(&sg_policy->work_lock); + cpufreq_policy_apply_limits(policy); + mutex_unlock(&sg_policy->work_lock); + } + + sg_policy->need_freq_update = true; + + return 0; +} + +static int cpufreq_schedutil_cb(struct cpufreq_policy *policy, + unsigned int event) +{ + switch(event) { + case CPUFREQ_GOV_POLICY_INIT: + return sugov_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return sugov_exit(policy); + case CPUFREQ_GOV_START: + return sugov_start(policy); + case CPUFREQ_GOV_STOP: + return sugov_stop(policy); + case CPUFREQ_GOV_LIMITS: + return sugov_limits(policy); + default: + BUG(); + } +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +static +#endif +struct cpufreq_governor cpufreq_gov_schedutil = { + .name = "schedutil", + .governor = cpufreq_schedutil_cb, + .owner = THIS_MODULE, +}; + +static int __init sugov_register(void) +{ + return cpufreq_register_governor(&cpufreq_gov_schedutil); +} +fs_initcall(sugov_register); From ca7b7d3c9995a12d57a7e7df6bb567a63b6cad00 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Mon, 24 Oct 2016 18:22:19 -0700 Subject: [PATCH 0813/3220] sched/cpufreq: fix tunables for schedfreq governor The schedfreq governor does not currently handle cpufreq drivers which use a global set of tunables (!have_governor_per_policy). For example on x86 and using the acpi cpufreq driver, doing this cat /sys/devices/system/cpu/cpufreq/sched/up_throttle_nsec will result in a bad pointer access. Update the tunable code using the upstream schedutil tunable code by Rafael Wysocki as a guide. Includes a partial backport of the reorganized cpufreq tunable infrastructure. 
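For orientation, the lifecycle of the shared tunables object boils down to the refcounting sketched below, a simplified rendering of the diff that follows (locking and error paths elided; gd is the governor's per-policy data):

        /* First user allocates the tunables; publish them globally when
         * the driver has one set of tunables for all policies. */
        if (!global_tunables) {
                gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
                gov_attr_set_init(&gd->tunables->attr_set, &gd->tunables_hook);
                if (!have_governor_per_policy())
                        global_tunables = gd->tunables;
        } else {
                gd->tunables = global_tunables;
                gov_attr_set_get(&global_tunables->attr_set, &gd->tunables_hook);
        }

        /* On exit, the last user frees the tunables. */
        if (!gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook))
                kfree(gd->tunables);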
Change-Id: I7e6f8de1dac297077ad43f37dd2f6ddbfe921c98 Signed-off-by: Steve Muckle [fixed cherry-pick issue] Signed-off-by: Juri Lelli [fixed cherry-pick issue] Signed-off-by: Thierry Strudel --- kernel/sched/cpufreq_sched.c | 218 +++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 102 deletions(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index d751bc2d0d6e..f10d9f7d6d07 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -32,6 +32,12 @@ static struct cpufreq_governor cpufreq_gov_sched; static DEFINE_PER_CPU(unsigned long, enabled); DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +struct gov_tunables { + struct gov_attr_set attr_set; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; +}; + /** * gov_data - per-policy data internal to the governor * @up_throttle: next throttling period expiry if increasing OPP @@ -53,8 +59,8 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); struct gov_data { ktime_t up_throttle; ktime_t down_throttle; - unsigned int up_throttle_nsec; - unsigned int down_throttle_nsec; + struct gov_tunables *tunables; + struct list_head tunables_hook; struct task_struct *task; struct irq_work irq_work; unsigned int requested_freq; @@ -71,8 +77,10 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); - gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); + gd->up_throttle = ktime_add_ns(ktime_get(), + gd->tunables->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), + gd->tunables->down_throttle_nsec); up_write(&policy->rwsem); } @@ -262,12 +270,70 @@ static inline void clear_sched_freq(void) static_key_slow_dec(&__sched_freq); } -static struct attribute_group sched_attr_group_gov_pol; -static struct attribute_group *get_sysfs_attr(void) +/* Tunables */ +static struct gov_tunables *global_tunables; + +static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set) { - return &sched_attr_group_gov_pol; + return container_of(attr_set, struct gov_tunables, attr_set); } +static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->up_throttle_nsec); +} + +static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + tunables->up_throttle_nsec = val; + return count; +} + +static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->down_throttle_nsec); +} + +static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct gov_tunables *tunables = to_tunables(attr_set); + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + tunables->down_throttle_nsec = val; + return count; +} + +static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec); +static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec); + +static struct attribute *schedfreq_attributes[] = { + &up_throttle_nsec.attr, + 
&down_throttle_nsec.attr, + NULL +}; + +static struct kobj_type tunables_ktype = { + .default_attrs = schedfreq_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) { struct gov_data *gd; @@ -282,17 +348,39 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (!gd) return -ENOMEM; - gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? - policy->cpuinfo.transition_latency : - THROTTLE_UP_NSEC; - gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; - pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->up_throttle_nsec); + policy->governor_data = gd; - rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); - if (rc) { - pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); - goto err; + if (!global_tunables) { + gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL); + if (!gd->tunables) + goto free_gd; + + gd->tunables->up_throttle_nsec = + policy->cpuinfo.transition_latency ? + policy->cpuinfo.transition_latency : + THROTTLE_UP_NSEC; + gd->tunables->down_throttle_nsec = + THROTTLE_DOWN_NSEC; + + rc = kobject_init_and_add(&gd->tunables->attr_set.kobj, + &tunables_ktype, + get_governor_parent_kobj(policy), + "%s", cpufreq_gov_sched.name); + if (rc) + goto free_tunables; + + gov_attr_set_init(&gd->tunables->attr_set, + &gd->tunables_hook); + + pr_debug("%s: throttle_threshold = %u [ns]\n", + __func__, gd->tunables->up_throttle_nsec); + + if (!have_governor_per_policy()) + global_tunables = gd->tunables; + } else { + gd->tunables = global_tunables; + gov_attr_set_get(&global_tunables->attr_set, + &gd->tunables_hook); } policy->governor_data = gd; @@ -304,7 +392,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) if (IS_ERR_OR_NULL(gd->task)) { pr_err("%s: failed to create kschedfreq thread\n", __func__); - goto err; + goto free_tunables; } get_task_struct(gd->task); kthread_bind_mask(gd->task, policy->related_cpus); @@ -316,7 +404,9 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) return 0; -err: +free_tunables: + kfree(gd->tunables); +free_gd: policy->governor_data = NULL; kfree(gd); return -ENOMEM; @@ -324,6 +414,7 @@ err: static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) { + unsigned int count; struct gov_data *gd = policy->governor_data; clear_sched_freq(); @@ -332,7 +423,12 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) put_task_struct(gd->task); } - sysfs_remove_group(&policy->kobj, get_sysfs_attr()); + count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook); + if (!count) { + if (!have_governor_per_policy()) + global_tunables = NULL; + kfree(gd->tunables); + } policy->governor_data = NULL; @@ -394,88 +490,6 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy, return 0; } -/* Tunables */ -static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) -{ - return sprintf(buf, "%u\n", gd->up_throttle_nsec); -} - -static ssize_t store_up_throttle_nsec(struct gov_data *gd, - const char *buf, size_t count) -{ - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if (ret < 0) - return ret; - gd->up_throttle_nsec = val; - return count; -} - -static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) -{ - return sprintf(buf, "%u\n", gd->down_throttle_nsec); -} - -static ssize_t store_down_throttle_nsec(struct gov_data *gd, - const char *buf, size_t count) -{ - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if 
(ret < 0) - return ret; - gd->down_throttle_nsec = val; - return count; -} - -/* - * Create show/store routines - * - sys: One governor instance for complete SYSTEM - * - pol: One governor instance per struct cpufreq_policy - */ -#define show_gov_pol_sys(file_name) \ -static ssize_t show_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, char *buf) \ -{ \ - return show_##file_name(policy->governor_data, buf); \ -} - -#define store_gov_pol_sys(file_name) \ -static ssize_t store_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, const char *buf, size_t count) \ -{ \ - return store_##file_name(policy->governor_data, buf, count); \ -} - -#define gov_pol_attr_rw(_name) \ - static struct freq_attr _name##_gov_pol = \ - __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) - -#define show_store_gov_pol_sys(file_name) \ - show_gov_pol_sys(file_name); \ - store_gov_pol_sys(file_name) -#define tunable_handlers(file_name) \ - show_gov_pol_sys(file_name); \ - store_gov_pol_sys(file_name); \ - gov_pol_attr_rw(file_name) - -tunable_handlers(down_throttle_nsec); -tunable_handlers(up_throttle_nsec); - -/* Per policy governor instance */ -static struct attribute *sched_attributes_gov_pol[] = { - &up_throttle_nsec_gov_pol.attr, - &down_throttle_nsec_gov_pol.attr, - NULL, -}; - -static struct attribute_group sched_attr_group_gov_pol = { - .attrs = sched_attributes_gov_pol, - .name = "sched", -}; #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED static From 78213729a7d12af48a7953dd0c69a8d9b1b39e44 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:43 -0700 Subject: [PATCH 0814/3220] BACKPORT: kthread: allow to cancel kthread work We are going to use kthread workers more widely and sometimes we will need to make sure that the work is neither pending nor running. This patch implements cancel_*_sync() operations as inspired by workqueues. Well, we are synchronized against the other operations via the worker lock, we use del_timer_sync() and a counter to count parallel cancel operations. Therefore the implementation might be easier. First, we check if a worker is assigned. If not, the work has never been queued after it was initialized. Second, we take the worker lock. It must be the right one. The work must not be assigned to another worker unless it is initialized in between. Third, we try to cancel the timer when it exists. The timer is deleted synchronously to make sure that the timer callback is not running. We need to temporarily release the worker->lock to avoid a possible deadlock with the callback. In the meantime, we set work->canceling counter to avoid any queuing. Fourth, we try to remove the work from a worker list. It might be the list of either normal or delayed works. Fifth, if the work is running, we call kthread_flush_work(). It might take an arbitrary time. We need to release the worker->lock again. In the meantime, we again block any queuing by the canceling counter. As already mentioned, the check for a pending kthread work is done under a lock. Compared with workqueues, we do not need to fight for a single PENDING bit to block other operations. Therefore we do not suffer from the thundering storm problem and all parallel canceling jobs might use kthread_flush_work(). Any queuing is blocked until the counter reaches zero.
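For illustration, a minimal hypothetical user of the new API (the names below are examples only, not part of the patch, using the 3.18 queue_kthread_work() interface targeted here):

        static struct kthread_worker my_worker;
        static struct kthread_work my_work;

        /* A work function that re-queues itself. */
        static void my_work_fn(struct kthread_work *work)
        {
                /* ... one unit of work ... */
                queue_kthread_work(&my_worker, work);
        }

        static void my_shutdown(void)
        {
                /*
                 * Blocks further queuing via the canceling counter and
                 * waits for a running instance to finish, even though
                 * the work re-queues itself.
                 */
                kthread_cancel_work_sync(&my_work);
        }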
Change-Id: I8a8ece0f93c828f311d0ad5c88d80db2388e4808 Link: http://lkml.kernel.org/r/1470754545-17632-10-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry-picked from 37be45d49dec2a411e29d50c9597cfe8184b5645) [major changes to the original patch while cherry-picking; only rebased the sync variant] Signed-off-by: Juri Lelli --- include/linux/kthread.h | 4 ++ kernel/kthread.c | 96 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index e691b6a23f72..4289343ba1f9 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -75,6 +75,8 @@ struct kthread_work { struct list_head node; kthread_work_func_t func; struct kthread_worker *worker; + /* Number of canceling calls that are running at the moment. */ + int canceling; }; #define KTHREAD_WORKER_INIT(worker) { \ @@ -129,4 +131,6 @@ bool queue_kthread_work(struct kthread_worker *worker, void flush_kthread_work(struct kthread_work *work); void flush_kthread_worker(struct kthread_worker *worker); +bool kthread_cancel_work_sync(struct kthread_work *work); + #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 850b255649a2..698b8dec3074 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -604,6 +604,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* + * Returns true when the work could not be queued at the moment. + * It happens when it is already pending in a worker list + * or when it is being cancelled. + */ +static inline bool queuing_blocked(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + + return !list_empty(&work->node) || work->canceling; +} + /* insert @work before @pos in @worker */ static void insert_kthread_work(struct kthread_worker *worker, struct kthread_work *work, @@ -633,7 +646,7 @@ bool queue_kthread_work(struct kthread_worker *worker, unsigned long flags; spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { + if (!queuing_blocked(worker, work)) { insert_kthread_work(worker, work, &worker->work_list); ret = true; } @@ -694,6 +707,87 @@ retry: } EXPORT_SYMBOL_GPL(flush_kthread_work); +/* + * This function removes the work from the worker queue. Also it makes sure + * that it won't get queued later via the delayed work's timer. + * + * The work might still be in use when this function finishes. See the + * current_work processed by the worker. + * + * Return: %true if @work was pending and successfully canceled, + * %false if @work was not pending + */ +static bool __kthread_cancel_work(struct kthread_work *work, + unsigned long *flags) +{ + /* + * Try to remove the work from a worker list. It might either + * be from worker->work_list or from worker->delayed_work_list. + */ + if (!list_empty(&work->node)) { + list_del_init(&work->node); + return true; + } + + return false; +} + +static bool __kthread_cancel_work_sync(struct kthread_work *work) +{ + struct kthread_worker *worker = work->worker; + unsigned long flags; + int ret = false; + + if (!worker) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + /* Work must not be used with >1 worker, see kthread_queue_work().
*/ + WARN_ON_ONCE(work->worker != worker); + + ret = __kthread_cancel_work(work, &flags); + + if (worker->current_work != work) + goto out_fast; + + /* + * The work is in progress and we need to wait with the lock released. + * In the meantime, block any queuing by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, flags); + flush_kthread_work(work); + spin_lock_irqsave(&worker->lock, flags); + work->canceling--; + +out_fast: + spin_unlock_irqrestore(&worker->lock, flags); +out: + return ret; +} + +/** + * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish + * @work: the kthread work to cancel + * + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself. On return from this + * function, @work is guaranteed to be not pending or executing on any CPU. + * + * kthread_cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. + * + * The caller must ensure that the worker on which @work was last + * queued can't be destroyed before this function returns. + * + * Return: %true if @work was pending, %false otherwise. + */ +bool kthread_cancel_work_sync(struct kthread_work *work) +{ + return __kthread_cancel_work_sync(work); +} +EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); + /** * flush_kthread_worker - flush all current works on a kthread_worker * @worker: worker to flush From e2aa75a4c7812700d41ee6b45d3d85e0773805bc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:22 +0530 Subject: [PATCH 0815/3220] cpufreq: schedutil: move slow path from workqueue to SCHED_FIFO task If slow path frequency changes are conducted in a SCHED_OTHER context then they may be delayed for some amount of time, including indefinitely, when real time or deadline activity is taking place. Move the slow path to a real time kernel thread. In the future the thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set to 50 for now. Hackbench results on ARM Exynos, dual core A15 platform for 10 iterations:

$ hackbench -s 100 -l 100 -g 10 -f 20

Before		After
---------------------------------
1.808		1.603
1.847		1.251
2.229		1.590
1.952		1.600
1.947		1.257
1.925		1.627
2.694		1.620
1.258		1.621
1.919		1.632
1.250		1.240

Average: 1.8829	1.5041

Based on initial work by Steve Muckle. Change-Id: I8f53037e94f353960c6d10abf07822d671631ef7 Signed-off-by: Steve Muckle Signed-off-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki (cherry picked from 02a7b1ee3baa) [adapt to the 3.18 kthread interface] Signed-off-by: Juri Lelli --- kernel/sched/cpufreq_schedutil.c | 86 +++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7e42a1d6d88d..784142ef9924 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpufreq.h> +#include <linux/kthread.h> #include <linux/slab.h> #include <trace/events/power.h> @@ -22,6 +23,7 @@ #define cpufreq_driver_fast_switch(x, y) 0 #define cpufreq_enable_fast_switch(x) #define cpufreq_disable_fast_switch(x) +#define SUGOV_KTHREAD_PRIORITY 50 struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; }; struct sugov_policy { struct cpufreq_policy *policy; struct sugov_tunables *tunables; struct list_head tunables_hook; raw_spinlock_t update_lock; /* For shared policies */ u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; /* The next fields are only needed if fast switch cannot be used.
*/ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -297,7 +301,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); @@ -314,7 +318,21 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + + /* + * For Real Time and Deadline tasks, schedutil governor shoots the + * frequency to maximum. And special care must be taken to ensure that + * this kthread doesn't result in that. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() and before that schedutil + * rejects all other frequency scaling requests. + * + * Though there is a very rare case where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now. + */ + queue_kthread_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -380,7 +398,6 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) sg_policy->policy = policy; init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; @@ -392,6 +409,51 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) kfree(sg_policy); } +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + init_kthread_work(&sg_policy->work, sugov_work); + init_kthread_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + flush_kthread_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); +} + static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) { struct sugov_tunables *tunables; @@ -428,12 +490,16 @@ static int sugov_init(struct cpufreq_policy *policy) if (!sg_policy) return -ENOMEM; + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; + mutex_lock(&global_tunables_lock); if (global_tunables) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; - goto free_sg_policy; + goto 
stop_kthread; } policy->governor_data = sg_policy; sg_policy->tunables = global_tunables; @@ -445,7 +511,7 @@ static int sugov_init(struct cpufreq_policy *policy) tunables = sugov_tunables_alloc(sg_policy); if (!tunables) { ret = -ENOMEM; - goto free_sg_policy; + goto stop_kthread; } tunables->rate_limit_us = 20 * USEC_PER_MSEC; @@ -472,7 +538,10 @@ static int sugov_init(struct cpufreq_policy *policy) policy->governor_data = NULL; sugov_tunables_free(tunables); - free_sg_policy: +stop_kthread: + sugov_kthread_stop(sg_policy); + +free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); @@ -497,6 +566,7 @@ static int sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); + sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); return 0; @@ -546,7 +616,7 @@ static int sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + kthread_cancel_work_sync(&sg_policy->work); return 0; } From e5da6c11b20544e7afd11b252ee94721caf1f740 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 25 Aug 2016 15:59:17 -0700 Subject: [PATCH 0816/3220] sched: cpufreq: use rt_avg as estimate of required RT CPU capacity A policy of going to fmax on any RT activity will be detrimental for power on many platforms. Often RT accounts for only a small amount of CPU activity so sending the CPU frequency to fmax is overkill. Worse still, some platforms may not be able to even complete the CPU frequency change before the RT activity has already completed. Cpufreq governors have not treated RT activity this way in the past so it is not part of the expected semantics of the RT scheduling class. The DL class offers guarantees about task completion and could be used for this purpose. Modify the schedutil algorithm to instead use rt_avg as an estimate of RT utilization of the CPU. Based on previous work by Vincent Guittot . 
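In code terms, the estimate amounts to scaling rq->rt_avg by the CPU's capacity over the averaging period and adding it to the CFS utilization; the lines below restate the sugov_get_util() hunk from the diff that follows (declarations elided):

        /* Scale rt_avg into capacity units and add it to the CFS signal. */
        sched_avg_update(rq);
        delta = time - rq->age_stamp;
        if (unlikely(delta < 0))
                delta = 0;
        rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
        rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
        util = min(rq->cfs.avg.util_avg + rt, max_cap);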
Change-Id: I1ed605a3e2512a94d34217a8e57c3fd97cca60be Signed-off-by: Steve Muckle --- include/linux/sched.h | 2 -- kernel/sched/cpufreq_schedutil.c | 33 ++++++++++++++++++++------------ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index bc23d15244db..dc3da585bdad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3295,8 +3295,6 @@ static inline unsigned long rlimit_max(unsigned int limit) #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) -#define SCHED_CPUFREQ_RT_DL (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL) - #ifdef CONFIG_CPU_FREQ struct update_util_data { void (*func)(struct update_util_data *data, u64 time, unsigned int flags); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 784142ef9924..033da8f815da 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -156,15 +156,24 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max) +static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) { - struct rq *rq = this_rq(); - unsigned long cfs_max; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + unsigned long max_cap, rt; + s64 delta; - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + max_cap = arch_scale_cpu_capacity(NULL, cpu); - *util = min(rq->cfs.avg.util_avg, cfs_max); - *max = cfs_max; + sched_avg_update(rq); + delta = time - rq->age_stamp; + if (unlikely(delta < 0)) + delta = 0; + rt = div64_u64(rq->rt_avg, sched_avg_period() + delta); + rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT; + + *util = min(rq->cfs.avg.util_avg + rt, max_cap); + *max = max_cap; } static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, @@ -212,10 +221,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - if (flags & SCHED_CPUFREQ_RT_DL) { + if (flags & SCHED_CPUFREQ_DL) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, time); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_cpu, util, max); } @@ -232,7 +241,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned int j; - if (flags & SCHED_CPUFREQ_RT_DL) + if (flags & SCHED_CPUFREQ_DL) return max_f; sugov_iowait_boost(sg_cpu, &util, &max); @@ -258,7 +267,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, j_sg_cpu->iowait_boost = 0; continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) return max_f; j_util = j_sg_cpu->util; @@ -282,7 +291,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, time); raw_spin_lock(&sg_policy->update_lock); @@ -590,7 +599,7 @@ static int sugov_start(struct cpufreq_policy *policy) if (policy_is_shared(policy)) { sg_cpu->util = 0; sg_cpu->max = 0; - sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->flags = SCHED_CPUFREQ_DL; sg_cpu->last_update = 0; sg_cpu->cached_raw_freq = 0; sg_cpu->iowait_boost = 0; From f71d9f01c6fc165ba38cdab6cbb2e4443bd7e458 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 14 Dec 2016 16:10:10 +0000 
Subject: [PATCH 0817/3220] sched/cpufreq: make schedutil use WALT signal If WALT is available and enabled, make the schedutil governor use its utilization signal. Change-Id: I92bc37989447a76616e9bcc4e9e8616774fb9925 Signed-off-by: Juri Lelli [we need to use boosted_cpu_util for schedutil, so make it not static] Signed-off-by: Chris Redpath --- kernel/sched/cpufreq_schedutil.c | 9 +++++++++ kernel/sched/fair.c | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033da8f815da..e9a9ee98daf7 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -17,6 +17,11 @@ #include #include "sched.h" +#include "tune.h" + +#ifdef CONFIG_SCHED_WALT +unsigned long boosted_cpu_util(int cpu); +#endif /* Stub out fast switch routines present on mainline to reduce the backport * overhead. */ @@ -173,6 +178,10 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT; *util = min(rq->cfs.avg.util_avg + rt, max_cap); +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + *util = boosted_cpu_util(cpu); +#endif *max = max_cap; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index baba2d34c34e..abe49df1e6f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4258,7 +4258,7 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static bool cpu_overutilized(int cpu); -static inline unsigned long boosted_cpu_util(int cpu); +unsigned long boosted_cpu_util(int cpu); #else #define boosted_cpu_util(cpu) cpu_util(cpu) #endif @@ -5478,7 +5478,7 @@ schedtune_task_margin(struct task_struct *task) #endif /* CONFIG_SCHED_TUNE */ -static inline unsigned long +unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); From 3c5c4e972472fc527d6b027fcdfa4d88444bff52 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 30 Nov 2016 11:09:42 +0000 Subject: [PATCH 0818/3220] trace/sched: add rq utilization signal for WALT It is useful to be able to check current capacity against the rq utilization signal generated by WALT (for example, to check how a cpufreq governor is behaving). Add the rq utilization signal (same scale as capacity) to the walt_update_task_ravg tracepoint.
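For reference, the new field is derived as in this sketch (mirroring the two lines added in the diff below); prev_runnable_sum is in units of window time, so the shift and divide rescale it into the 0..SCHED_LOAD_SCALE capacity range:

	u64 util = rq->prev_runnable_sum << SCHED_LOAD_SHIFT;

	do_div(util, walt_ravg_window);
	/* util is now directly comparable against capacity values */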
Change-Id: I9aae3884a741d23ac494bef80d2303f107f135ce Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c18d8c89bd12..1d758d18a1b7 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -984,6 +984,7 @@ TRACE_EVENT(walt_update_task_ravg, __field( int, cpu ) __field( u64, cs ) __field( u64, ps ) + __field(unsigned long, util ) __field( u32, curr_window ) __field( u32, prev_window ) __field( u64, nt_cs ) @@ -1008,6 +1009,8 @@ TRACE_EVENT(walt_update_task_ravg, __entry->irqtime = irqtime; __entry->cs = rq->curr_runnable_sum; __entry->ps = rq->prev_runnable_sum; + __entry->util = rq->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(__entry->util, walt_ravg_window); __entry->curr_window = p->ravg.curr_window; __entry->prev_window = p->ravg.prev_window; __entry->nt_cs = rq->nt_curr_runnable_sum; @@ -1016,16 +1019,15 @@ TRACE_EVENT(walt_update_task_ravg, ), TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" - " cs %llu ps %llu cur_window %u prev_window %u nt_cs %llu nt_ps %llu active_wins %u" + " cs %llu ps %llu util %lu cur_window %u prev_window %u active_wins %u" , __entry->wallclock, __entry->win_start, __entry->delta, __entry->evt, __entry->cpu, __entry->cur_freq, __entry->cur_pid, __entry->pid, __entry->comm, __entry->mark_start, __entry->delta_m, __entry->demand, __entry->sum, __entry->irqtime, - __entry->cs, __entry->ps, + __entry->cs, __entry->ps, __entry->util, __entry->curr_window, __entry->prev_window, - __entry->nt_cs, __entry->nt_ps, __entry->active_windows ) ); From 51b20b214f0e6a2b52bae9f9e748c87b1820f9b2 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 17 Nov 2016 10:48:45 +0530 Subject: [PATCH 0819/3220] cpufreq: schedutil: add up/down frequency transition rate limits The rate-limit tunable in the schedutil governor applies to transitions to both lower and higher frequencies. On several platforms it is not the ideal tunable though, as it is difficult to get the best power/performance figures using the same limit in both directions. It is common on mobile platforms with demanding user interfaces to want to increase frequency rapidly, for example, but decrease it slowly. One example is a case where we have short busy periods followed by similar or longer idle periods. If we keep the rate-limit high enough, we will not go to higher frequencies soon enough. On the other hand, if we keep it too low, we will have too many frequency transitions, as we will always reduce the frequency after the busy period. It would be very useful if we could set a low rate-limit while increasing the frequency (so that we can respond to short busy periods quickly) and a high rate-limit while decreasing frequency (so that we don't reduce the frequency immediately after a short busy period, which may also avoid frequency transitions before the next busy period). Implement separate up/down transition rate limits. Note that the governor avoids frequency recalculations for a period equal to the minimum of the up and down rate-limits. A global mutex is also defined to protect updates to min_rate_limit_us via two separate sysfs files. Note that this doesn't change the behavior of the schedutil governor for platforms which wish to keep the same values for both up and down rate limits. This was tested with rt-app [1] on an ARM Exynos dual-A15 platform.
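In sketch form, the gating added here works as follows (the full version is sugov_up_down_rate_limit() in the diff below); the test setup is described next:

	delta_ns = time - sg_policy->last_freq_update_time;

	/* rising transitions must wait for the up limit ... */
	if (next_freq > sg_policy->next_freq &&
	    delta_ns < sg_policy->up_rate_delay_ns)
		return true;	/* rate limited */

	/* ... falling transitions for the down limit */
	if (next_freq < sg_policy->next_freq &&
	    delta_ns < sg_policy->down_rate_delay_ns)
		return true;	/* rate limited */

	return false;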
Testcase: Run a SCHED_OTHER thread on CPU0 which will emulate workload for X ms of busy period out of the total period of Y ms, i.e. Y - X ms of idle period. The values of X/Y taken were: 20/40, 20/50, 20/70, i.e. idle periods of 20, 30 and 50 ms respectively. These were tested against values of up/down rate limits as: 10/10 ms and 10/40 ms. For every test we noticed a performance increase of 5-10% with the schedutil governor, which was very much expected. [Viresh]: Simplified user interface and introduced min_rate_limit_us + mutex, rewrote commit log and included test results. [1] https://github.com/scheduler-tools/rt-app/ Change-Id: I18720a83855b196b8e21dcdc8deae79131635b84 Signed-off-by: Steve Muckle Signed-off-by: Viresh Kumar (applied from https://marc.info/?l=linux-kernel&m=147936011103832&w=2) [trivial adaptations] Signed-off-by: Juri Lelli --- kernel/sched/cpufreq_schedutil.c | 107 ++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 16 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e9a9ee98daf7..5c7c52077ad1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -28,11 +28,13 @@ unsigned long boosted_cpu_util(int cpu); #define cpufreq_driver_fast_switch(x, y) 0 #define cpufreq_enable_fast_switch(x) #define cpufreq_disable_fast_switch(x) +#define LATENCY_MULTIPLIER (1000) #define SUGOV_KTHREAD_PRIORITY 50 struct sugov_tunables { struct gov_attr_set attr_set; - unsigned int rate_limit_us; + unsigned int up_rate_limit_us; + unsigned int down_rate_limit_us; }; struct sugov_policy { @@ -43,7 +45,9 @@ struct sugov_policy { raw_spinlock_t update_lock; /* For shared policies */ u64 last_freq_update_time; - s64 freq_update_delay_ns; + s64 min_rate_limit_ns; + s64 up_rate_delay_ns; + s64 down_rate_delay_ns; unsigned int next_freq; /* The next fields are only needed if fast switch cannot be used.
*/ @@ -94,7 +98,27 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) } delta_ns = time - sg_policy->last_freq_update_time; - return delta_ns >= sg_policy->freq_update_delay_ns; + + /* No need to recalculate next freq for min_rate_limit_us at least */ + return delta_ns >= sg_policy->min_rate_limit_ns; +} + +static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + s64 delta_ns; + + delta_ns = time - sg_policy->last_freq_update_time; + + if (next_freq > sg_policy->next_freq && + delta_ns < sg_policy->up_rate_delay_ns) + return true; + + if (next_freq < sg_policy->next_freq && + delta_ns < sg_policy->down_rate_delay_ns) + return true; + + return false; } static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, @@ -102,6 +126,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, { struct cpufreq_policy *policy = sg_policy->policy; + if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) + return; + sg_policy->last_freq_update_time = time; if (policy->fast_switch_enabled) { @@ -363,15 +390,32 @@ static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr return container_of(attr_set, struct sugov_tunables, attr_set); } -static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +static DEFINE_MUTEX(min_rate_lock); + +static void update_min_rate_limit_us(struct sugov_policy *sg_policy) +{ + mutex_lock(&min_rate_lock); + sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns, + sg_policy->down_rate_delay_ns); + mutex_unlock(&min_rate_lock); +} + +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); - return sprintf(buf, "%u\n", tunables->rate_limit_us); + return sprintf(buf, "%u\n", tunables->up_rate_limit_us); } -static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, - size_t count) +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->down_rate_limit_us); +} + +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); struct sugov_policy *sg_policy; @@ -380,18 +424,42 @@ static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *bu if (kstrtouint(buf, 10, &rate_limit_us)) return -EINVAL; - tunables->rate_limit_us = rate_limit_us; + tunables->up_rate_limit_us = rate_limit_us; - list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) - sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + } return count; } -static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->down_rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->down_rate_delay_ns = rate_limit_us * 
NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + } + + return count; +} + +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); static struct attribute *sugov_attributes[] = { - &rate_limit_us.attr, + &up_rate_limit_us.attr, + &down_rate_limit_us.attr, NULL }; @@ -532,10 +600,13 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->rate_limit_us = 20 * USEC_PER_MSEC; + tunables->up_rate_limit_us = LATENCY_MULTIPLIER; + tunables->down_rate_limit_us = LATENCY_MULTIPLIER; lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; + if (lat) { + tunables->up_rate_limit_us *= lat; + tunables->down_rate_limit_us *= lat; + } policy->governor_data = sg_policy; sg_policy->tunables = tunables; @@ -595,7 +666,11 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; - sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->up_rate_delay_ns = + sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC; + sg_policy->down_rate_delay_ns = + sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); sg_policy->last_freq_update_time = 0; sg_policy->next_freq = UINT_MAX; sg_policy->work_in_progress = false; From 5c015afebd4dd130ba86c12e8dabceab0f7f3d15 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 21 Feb 2017 10:15:18 +0530 Subject: [PATCH 0820/3220] FROM-LIST: cpufreq: schedutil: Redefine the rate_limit_us tunable The rate_limit_us tunable is intended to reduce the possible overhead from running the schedutil governor. However, that overhead can be divided into two separate parts: the governor computations and the invocation of the scaling driver to set the CPU frequency. The latter is where the real overhead comes from. The former is much less expensive in terms of execution time and running it every time the governor callback is invoked by the scheduler, after rate_limit_us interval has passed since the last frequency update, would not be a problem. For this reason, redefine the rate_limit_us tunable so that it means the minimum time that has to pass between two consecutive invocations of the scaling driver by the schedutil governor (to set the CPU frequency). Change-Id: Iced64116b826c25441ef537c27a3dabfcf81919e Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki [pulled from linux-pm linux-next https://patchwork.kernel.org/patch/9583949/ ] Signed-off-by: Chris Redpath --- kernel/sched/cpufreq_schedutil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5c7c52077ad1..191ba36a9eeb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -129,14 +129,13 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) return; - sg_policy->last_freq_update_time = time; - if (policy->fast_switch_enabled) { if (sg_policy->next_freq == next_freq) { trace_cpu_frequency(policy->cur, smp_processor_id()); return; } sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; @@ -145,6 +144,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, trace_cpu_frequency(next_freq, smp_processor_id()); } else if (sg_policy->next_freq != next_freq) { sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } From bd6ff3505f2f9c10ab6fdebc70378f5549b0323b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 4 Dec 2016 17:47:53 +0000 Subject: [PATCH 0821/3220] Revert "WIP: sched: Consider spare cpu capacity at task wake-up" This reverts commit 75a9695b619741019363f889c99c97c7bb823797. Change-Id: I846b21f2bdeb0b0ca30ad65683564ed07a429428 Signed-off-by: Dietmar Eggemann [ minor merge changes ] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index abe49df1e6f0..12bb2ae026f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5509,10 +5509,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL, *spare_group = NULL; + struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; - unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5520,7 +5519,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_capacity; + unsigned long load, avg_load; int local_group; int i; @@ -5552,16 +5551,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, fit_capacity = capacity_of(i); fit_group = group; } - - /* - * Look for group which has most spare capacity on a - * single cpu. 
- */ - spare_capacity = capacity_of(i) - cpu_util(i); - if (spare_capacity > max_spare_capacity) { - max_spare_capacity = spare_capacity; - spare_group = group; - } } /* Adjust by relative CPU capacity of the group */ @@ -5578,9 +5567,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (fit_group) return fit_group; - if (spare_group) - return spare_group; - if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; From cb88574a6866aaf09ff172d30f2512bd4a0a66ba Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 25 Jan 2017 10:03:36 +0000 Subject: [PATCH 0822/3220] Partial Revert: "WIP: sched: Add cpu capacity awareness to wakeup balancing" Revert the changes in find_idlest_cpu() and find_idlest_group(). Keep the infrastructure bits which are used in following EAS patches. Change-Id: Id516ca5f3e51b9a13db1ebb8de2df3aa25f9679b Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 12bb2ae026f5..2150edce955a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5384,11 +5384,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static inline bool task_fits_spare(struct task_struct *p, int cpu) -{ - return __task_fits(p, cpu, cpu_util(cpu)); -} - static bool cpu_overutilized(int cpu) { return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); @@ -5509,9 +5504,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; - unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5542,15 +5535,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; - - /* - * Look for most energy-efficient group that can fit - * that can fit the task. 
- */ - if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { - fit_capacity = capacity_of(i); - fit_group = group; - } } /* Adjust by relative CPU capacity of the group */ @@ -5564,9 +5548,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); - if (fit_group) - return fit_group; - if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -5587,7 +5568,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (task_fits_spare(p, i)) { + if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -5599,8 +5580,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (idle_cpu(i) && - (!idle || idle->exit_latency == min_exit_latency) && + } else if ((!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -5609,13 +5589,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (shallowest_idle_cpu == -1) { - /* - * If we haven't found an idle CPU yet - * pick a non-idle one that can fit the task as - * fallback. - */ - shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -5943,8 +5916,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + want_affine = (!wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || energy_aware(); rcu_read_lock(); From 986aa1d498f44463a6f425ce40ddd66059a789e5 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:12 +0100 Subject: [PATCH 0823/3220] UPSTREAM: sched/core: Fix power to capacity renaming in comment It seems that this one escaped Nico's renaming of cpu_power to cpu_capacity a while back.
Change-Id: Ic2569d714db7b740f1df4ccc381ba8c1772c2793 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit bd425d4bfc7a1a6064dbbadfbac9c7eec0e426ec) Signed-off-by: Chris Redpath --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index dc3da585bdad..f8096788cb1b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1003,7 +1003,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ From 3bb3d7e7d9588334b586084f32495f448745f343 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:13 +0100 Subject: [PATCH 0824/3220] BACKPORT: sched/fair: Make the use of prev_cpu consistent in the wakeup path In commit: ac66f5477239 ("sched/numa: Introduce migrate_swap()") select_task_rq() got a 'cpu' argument to enable overriding of prev_cpu in special cases (NUMA task swapping). However, the select_task_rq_fair() helper functions: wake_affine() and select_idle_sibling(), still use task_cpu(p) directly to work out prev_cpu, which leads to inconsistencies. This patch passes prev_cpu (potentially overridden by NUMA code) into the helper functions to ensure prev_cpu is indeed the same CPU everywhere in the wakeup path. Change-Id: I4951c4eead2e6045e4fb34e89f6cda17d881d4d7 cc: Ingo Molnar cc: Rik van Riel Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 772bd008cd9a1d4e8ce566f2edcc61d1c28fcbe5) [merged with Android/EAS wakeup path] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2150edce955a..9b64a0e74da7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -670,7 +670,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -1404,7 +1404,8 @@ balance: * Call select_idle_sibling to maybe find a better one. 
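Concretely, the helpers now receive the caller's (possibly NUMA-overridden) prev_cpu instead of recomputing it with task_cpu(p); the call shapes in the diff below become:

	if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
		new_cpu = cpu;
	/* ... */
	new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);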
*/ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: assigned = true; @@ -5280,18 +5281,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -5605,11 +5606,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); int best_idle = -1; int best_idle_cstate = -1; int best_idle_capacity = INT_MAX; @@ -5621,8 +5621,8 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + return prev; } /* @@ -5632,6 +5632,7 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; @@ -5942,7 +5943,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } @@ -5950,7 +5951,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { struct sched_group *group; From bc7c939b3a3adcf2cd15e33677ed9246d77e0074 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 22 Jun 2016 18:03:14 +0100 Subject: [PATCH 0825/3220] UPSTREAM: sched/fair: Optimize find_idlest_cpu() when there is no choice In the current find_idlest_group()/find_idlest_cpu() search we end up calling find_idlest_cpu() in a sched_group containing only one CPU in the end. Checking idle-states becomes pointless when there is no alternative, so bail out instead. 
Change-Id: Ic62bf09b53a7984143ac2431aaa69c69b204cd56 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: linux-kernel@vger.kernel.org Cc: mgalbraith@suse.de Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1466615004-3503-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit eaecf41f5abf80b63c8e025fcb9ee4aa203c3038) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b64a0e74da7..5294c5f47939 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5567,6 +5567,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { From 3e9cdd5ae95a950c9cd3fc55a7f549ebed47aba6 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:21 +0100 Subject: [PATCH 0826/3220] UPSTREAM: sched/core: Remove unnecessary NULL-pointer check Checking if the sched_domain pointer returned by sd_init() is NULL seems pointless as sd_init() neither checks if it is valid to begin with nor sets it to NULL. Change-Id: I5e16fd0c2ca7234b097be7c95409ddb15c5e9de9 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 0e6d2a67a41321b3ef650b780a279a37855de08e) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1df6da0094f0..8c5778053a20 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7275,8 +7275,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, struct sched_domain *child, int cpu) { struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { From 68a3b157d92cd802f284ceec44fd26c3b3cec86d Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:22 +0100 Subject: [PATCH 0827/3220] UPSTREAM: sched/core: Introduce SD_ASYM_CPUCAPACITY sched_domain topology flag Add a topology flag to the sched_domain hierarchy indicating the lowest domain level where the full range of CPU capacities is represented by the domain members for asymmetric capacity topologies (e.g. ARM big.LITTLE). The flag is intended to indicate that extra care should be taken when placing tasks on CPUs and this level spans all the different types of CPUs found in the system (no need to look further up the domain hierarchy). This information is currently only available by iterating through the capacities of all the CPUs at parent levels in the sched_domain hierarchy.
SD 2      [  0      1      2      3]  SD_ASYM_CPUCAPACITY

SD 1      [  0      1] [   2      3]  !SD_ASYM_CPUCAPACITY

CPU:         0      1      2      3
capacity:  756    756   1024   1024

If the topology in the example above is duplicated to create an eight CPU example with a third sched_domain level on top (SD 3), this level should not have the flag set (!SD_ASYM_CPUCAPACITY) as its two groups would both have all CPU capacities represented within them. Change-Id: I1526407b90567cac387419719b7d7fdc8b259a85 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 1f6e6c7cb9bcd58abb5ee11243e0eefe6b36fc8e) [trivial merge conflict] Signed-off-by: Chris Redpath --- include/linux/sched.h | 1 + kernel/sched/core.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f8096788cb1b..436f36f768c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1003,6 +1003,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ #define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8c5778053a20..1294c3cacab0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6037,6 +6037,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | SD_SHARE_CAP_STATES)) { @@ -6068,6 +6069,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -6749,11 +6751,19 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_SHARE_CAP_STATES - describes shared capacity states + * These flags are purely descriptive of the topology and do not prescribe + * behaviour.
Behaviour is artificial and mapped in the below sd_init() + * function: + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies + * SD_SHARE_CAP_STATES - describes shared capacity states + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -6763,6 +6773,7 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN | \ SD_SHARE_CAP_STATES) From e51c05769490eede15d97e38b2a470230eea056b Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:23 +0100 Subject: [PATCH 0828/3220] UPSTREAM: sched/core: Pass child domain into sd_init() If behavioural sched_domain flags depend on topology flags set at higher domain levels we need a way to update the child domain flags. Moving the child pointer assignment inside sd_init() should make that possible. Change-Id: If043921fdf102c310adcc9e0280afa33c48c4783 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 3676b13e8524c576825fe1e731e347dba0083888) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1294c3cacab0..4dbabf491aab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6778,7 +6778,8 @@ static int sched_domains_curr_level; SD_SHARE_CAP_STATES) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + struct sched_domain *child, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); int sd_weight, sd_flags = 0; @@ -6830,6 +6831,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif @@ -7285,14 +7287,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); + struct sched_domain *sd = sd_init(tl, child, cpu); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { From 5173a85e22f97d5a4573b6c45aefc3ee93ae908b Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:24 +0100 Subject: [PATCH 0829/3220] UPSTREAM: sched/core: Enable SD_BALANCE_WAKE for asymmetric capacity systems A domain with the SD_ASYM_CPUCAPACITY flag set indicate that sched_groups at this level and below do not 
include CPUs of all capacities available (e.g. group containing little-only or big-only CPUs in big.LITTLE systems). It is therefore necessary to put in more effort in finding an appropriate CPU at task wake-up by enabling balancing at wake-up (SD_BALANCE_WAKE) on all lower (child) levels. Change-Id: I4615917f540d03d7e7ef7de8f0da33b1ad97387c Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-8-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 9ee1cda5ee25c7dd82acf25892e0d229e818f8c7) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4dbabf491aab..fcef2588712a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6841,6 +6841,13 @@ sd_init(struct sched_domain_topology_level *tl, * Convert topological properties into behaviour. */ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; From abfff9decebfd6b759ce09dcc810a5ed66252b04 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 25 Jul 2016 14:34:26 +0100 Subject: [PATCH 0830/3220] UPSTREAM: sched/fair: Let asymmetric CPU configurations balance at wake-up Currently, SD_WAKE_AFFINE always takes priority over wakeup balancing if SD_BALANCE_WAKE is set on the sched_domains. For asymmetric configurations SD_WAKE_AFFINE is only desirable if the waking task's compute demand (utilization) is suitable for the waking CPU and the previous CPU, and all CPUs within their respective SD_SHARE_PKG_RESOURCES domains (sd_llc). If not, let wakeup balancing take over (find_idlest_{group, cpu}()). This patch makes affine wake-ups conditional on whether both the waker CPU and the previous CPU have sufficient capacity for the waking task, assuming that the CPU capacities within an SD_SHARE_PKG_RESOURCES domain (sd_llc) are homogeneous.
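A worked example, reusing the 756/1024 big.LITTLE capacities shown earlier in this series and assuming a system-wide max capacity of 1024 (capacity_margin = 1280, ~20%, is introduced in the diff below): min_cap = 756 and max_cap = 1024, so max_cap - min_cap = 268 is not below max_cap >> 3 = 128 and the capacity check is not short-circuited. The affine path is then abandoned once min_cap * 1024 < task_util(p) * capacity_margin, i.e. 756 * 1024 = 774144 < task_util(p) * 1280, which holds for any task utilization above roughly 604; such wake-ups fall back to find_idlest_{group,cpu}().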
Change-Id: I6d5d0426713da9ef6198f574ad9afbe58dacc1f0 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1469453670-2660-10-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 3273163c6775c4c21823985304c2364b08ca6ea2) [removed existing definition of capacity_margin] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5294c5f47939..19edbc48b1bc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -128,6 +128,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * 1024 < capacity * margin + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -5358,8 +5364,6 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -unsigned int capacity_margin = 1280; /* ~20% margin */ - static inline unsigned long boosted_task_util(struct task_struct *task); static inline bool __task_fits(struct task_struct *p, int cpu, int util) @@ -5496,6 +5500,13 @@ boosted_task_util(struct task_struct *task) return util + margin; } +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5899,6 +5910,45 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) return target_cpu; } +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. 
In practice, this is SD_BALANCE_WAKE, @@ -5921,7 +5971,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || energy_aware(); rcu_read_lock(); From 68c27298cde48dcdd2714af992879e5b9fc74c5c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:07 +0100 Subject: [PATCH 0831/3220] UPSTREAM: sched/fair: Compute task/cpu utilization at wake-up correctly At task wake-up load-tracking isn't updated until the task is enqueued. The task's own view of its utilization contribution may therefore not be aligned with its contribution to the cfs_rq load-tracking which may have been updated in the meantime. Basically, the task's own utilization hasn't yet accounted for the sleep decay, while the cfs_rq may have (partially). Estimating the cfs_rq utilization in case the task is migrated at wake-up as task_rq(p)->cfs.avg.util_avg - p->se.avg.util_avg is therefore incorrect as the two load-tracking signals aren't time synchronized (different last update). To solve this problem, this patch synchronizes the task utilization with its previous rq before the task utilization is used in the wake-up path. Currently the update/synchronization is done _after_ the task has been placed by select_task_rq_fair(). The synchronization is done without having to take the rq lock using the existing mechanism used in remove_entity_load_avg(). Change-Id: I5605cca0c94c6ba43d9ce11554765a2456cf85bc Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 104cb16d9eb684f071d5bf3aa87c0d01af259b7c) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 19edbc48b1bc..615e1dcab36f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2924,6 +2924,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) } #endif +/* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). 
@@ -2931,7 +2944,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * Newly created task or never used group entity should not be removed @@ -2940,9 +2952,7 @@ void remove_entity_load_avg(struct sched_entity *se) if (se->avg.last_update_time == 0) return; - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -5946,6 +5956,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) if (max_cap - min_cap < max_cap >> 3) return 0; + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + return min_cap * 1024 < task_util(p) * capacity_margin; } From f3f132b8e5503e750c2b89de83c3191e58798170 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:08 +0100 Subject: [PATCH 0832/3220] UPSTREAM: sched/fair: Consider spare capacity in find_idlest_group() In low-utilization scenarios comparing relative loads in find_idlest_group() doesn't always lead to the most optimum choice. Systems with groups containing different numbers of cpus and/or cpus of different compute capacity are significantly better off when considering spare capacity rather than relative load in those scenarios. In addition to existing load based search an alternative spare capacity based candidate sched_group is found and selected instead if sufficient spare capacity exists. If not, existing behaviour is preserved. Change-Id: I6097af76c302a5a12e240ca24c70f707ad118242 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 6a0b19c0f39a7a7b7fb77d3867a733136ff059a3) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 615e1dcab36f..82eb26bd4361 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5526,7 +5526,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5534,7 +5536,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -5546,8 +5548,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare 
capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5557,6 +5563,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -5564,12 +5575,33 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + */ + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; From 60cc9f4e1e9fa3e951fb5a04d420da6eeb3067ee Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:09 +0100 Subject: [PATCH 0833/3220] UPSTREAM: sched/fair: Add per-CPU min capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all CPUs in the sched_group. Unless it is divided by the group_weight to get the average capacity per CPU, it hides differences in CPU capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers CPUs of different capacities. Instead, by extending struct sched_group_capacity to indicate min per-CPU capacity in the group a suitable group for a given task utilization can more easily be found such that CPUs with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). 
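To see why the sum or the average can mislead (illustrative numbers only): a group with per-CPU capacities {756, 1024} averages 890, so a task with util_avg 800 looks like it fits, yet it exceeds the 756 CPU. A hypothetical fit check built on the new field, using the capacity_margin convention from earlier in this series, could look like:

	/* hypothetical helper, not part of this patch */
	static bool fits_group_min(unsigned long task_util,
				   struct sched_group_capacity *sgc)
	{
		/* compare against the weakest CPU, with ~20% headroom */
		return task_util * capacity_margin < sgc->min_capacity * 1024;
	}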
Change-Id: If3cae1be62d01a199e752bca5abb45357d5d0fbd Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit bf475ce0a3dd75b5d1df6c6c14ae25168caa15ac) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 7 ++++++- kernel/sched/sched.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fcef2588712a..948ef2438606 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6459,6 +6459,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82eb26bd4361..d18e2c7b67e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7268,13 +7268,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, max_capacity; + unsigned long capacity, max_capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7288,6 +7289,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; max_capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -7318,6 +7320,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } max_capacity = max(capacity, max_capacity); + min_capacity = min(capacity, min_capacity); } } else { /* @@ -7331,12 +7334,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = max_capacity; + sdg->sgc->min_capacity = min_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 237b0bcdd304..6bc2dd623b17 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -899,6 +899,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long max_capacity; /* Max per-cpu capacity in group */ + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From adc7f08b2fb617a7b1c30e9dd87e36dcfbe9facb Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:10 +0100 Subject: [PATCH 0834/3220] UPSTREAM: sched/fair: Avoid pulling tasks from non-overloaded higher capacity groups For asymmetric CPU capacity systems it is counter-productive for throughput if low capacity CPUs are pulling tasks from non-overloaded CPUs with higher capacity. 
The assumption is that higher CPU capacity is preferred over running alone in a group with lower CPU capacity. This patch rejects higher CPU capacity groups with one or fewer tasks per CPU as a potential busiest group which could otherwise lead to a series of failing load-balancing attempts that end in a forced migration. Change-Id: I428875bb6267c780026ef75e2882300738d016e7 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 9e0994c0a1c1f82c705f1f66388e1bcffcee8bb9) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d18e2c7b67e5..c00a8ae48a46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7569,14 +7569,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + /* - * Candiate sg has no more than one task per cpu and has higher - * per-cpu capacity. No reason to pull tasks to less capable cpus. + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && group_smaller_cpu_capacity(sds->local, sg)) return false; +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; From c6cc7ca9151303d56b519ffcf129c5ecbca9af58 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:12 +0100 Subject: [PATCH 0835/3220] UPSTREAM: sched/fair: Fix incorrect comment for capacity_margin The comment for capacity_margin introduced in: 3273163c6775 ("sched/fair: Let asymmetric CPU configurations balance at wake-up") ... got its usage the wrong way round - fix it.
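Worked through with the actual value, margin = 1280 (~20% headroom), the corrected form util * margin < capacity * 1024 behaves as expected: a task with util 800 fits a 1024-capacity CPU since 800 * 1280 = 1024000 < 1024 * 1024 = 1048576, while util 850 does not (850 * 1280 = 1088000 >= 1048576).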
Change-Id: Ie46eac3e5ff43397b5bed61d0999d2817f1a1d96 Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 893c5d2279041afeb593f1fa8edd9d02edf5b7cb) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c00a8ae48a46..87c03eaacec4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -130,7 +130,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; /* * The margin used when comparing utilization with CPU capacity: - * util * 1024 < capacity * margin + * util * margin < capacity * 1024 */ unsigned int capacity_margin = 1280; /* ~20% */ From 65cad23dcdf7334c785a3c7fd74bdec633156364 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 30 Nov 2016 20:36:31 +0000 Subject: [PATCH 0836/3220] arm64: Set SD_ASYM_CPUCAPACITY sched_domain flag on DIE level Enable Capacity-Aware Scheduling. This is a shortcut from the EAS integration implementation. We know there are SoCs in use which have asymmetric cpu capacities on DIE level. Change-Id: I7f1326e86b8fc08b65d1c1341da7d1ef1013dcbc Signed-off-by: Dietmar Eggemann (cherry picked from commit 82f95a8cdff67770f02b7e4f5b1c084f42afc0e2) Signed-off-by: Chris Redpath --- arch/arm64/kernel/topology.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 5b2c67a510d8..7758f7ff131b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -258,6 +258,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +static int cpu_cpu_flags(void) +{ + return SD_ASYM_CPUCAPACITY; +} + static inline int cpu_corepower_flags(void) { return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ @@ -268,7 +273,7 @@ static struct sched_domain_topology_level arm64_topology[] = { #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, cpu_cpu_flags, cpu_cluster_energy, SD_INIT_NAME(DIE) }, { NULL, }, }; From 168228463cacf92ee12563ddf612b7489dc3d3fa Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 1 Dec 2016 18:37:58 +0000 Subject: [PATCH 0837/3220] sched/fair: Do not force want_affine eq. true if EAS is enabled This lets us use Capacity-Aware Scheduling (CAS) if EAS is enabled.
Change-Id: I2e647a201ea0b733d1487c3e153047a49fb22847 Signed-off-by: Dietmar Eggemann (cherry picked from commit 00b7da2ae58bf568529e67614980f77e275b8d29) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 87c03eaacec4..976e7457b4fa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6016,9 +6016,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || - energy_aware(); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { From 3b6ba235bcf3827d4911f3eb27bae5ac0f4dbbc8 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 26 Jan 2017 16:04:34 +0000 Subject: [PATCH 0838/3220] sched/fair: Decommission energy_aware_wake_cpu() The EAS functionality in the wakeup path will be brought back by the following patch ("sched/fair: Energy-aware wake-up task placement") providing the function select_energy_cpu_brute(). Change-Id: I927fb9e8261cfacfe404695f853941c7959aa146 [ Trivial merge conflicts resolved. ] Signed-off-by: Chris Redpath Signed-off-by: Dietmar Eggemann (cherry picked from commit 80aee424fb7765a777267e144037642625a71304) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 120 +------------------------------------------- 1 file changed, 1 insertion(+), 119 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 976e7457b4fa..ef16d4d3cc00 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5836,122 +5836,6 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre return target_cpu; } -static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) -{ - struct sched_domain *sd; - struct sched_group *sg, *sg_target; - int target_max_cap = INT_MAX; - int target_cpu = task_cpu(p); - unsigned long task_util_boosted, new_util; - int i; - - if (sysctl_sched_sync_hint_enable && sync) { - int cpu = smp_processor_id(); - cpumask_t search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); - if (cpumask_test_cpu(cpu, &search_cpus)) - return cpu; - } - - sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); - - if (!sd) - return target; - - sg = sd->groups; - sg_target = sg; - - if (sysctl_sched_is_big_little) { - - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); - - /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. 
- */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); - - task_util_boosted = boosted_task_util(p); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - new_util = cpu_util(i) + task_util_boosted; - - /* - * Ensure minimum capacity to grant the required boost. - * The target CPU can be already at a capacity level higher - * than the one required to boost the task. - */ - if (new_util > capacity_orig_of(i)) - continue; - - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; - } - - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; - } - } else { - /* - * Find a cpu with sufficient capacity - */ -#ifdef CONFIG_CGROUP_SCHEDTUNE - bool boosted = schedtune_task_boost(p) > 0; - bool prefer_idle = schedtune_prefer_idle(p) > 0; -#else - bool boosted = 0; - bool prefer_idle = 0; -#endif - int tmp_target = find_best_target(p, boosted, prefer_idle); - if (tmp_target >= 0) { - target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) - return target_cpu; - } - } - - if (target_cpu != task_cpu(p)) { - struct energy_env eenv = { - .util_delta = task_util(p), - .src_cpu = task_cpu(p), - .dst_cpu = target_cpu, - .task = p, - }; - - /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(task_cpu(p))) - return target_cpu; - - if (energy_diff(&eenv) >= 0) - return task_cpu(p); - } - - return target_cpu; -} - /* * cpu_util_wake: Compute cpu utilization with any contributions from * the waking task p removed. @@ -6047,9 +5931,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); - else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { From 02cbde61f4d895c25a422813b066d5537f8812fa Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 30 Mar 2016 14:20:12 +0100 Subject: [PATCH 0839/3220] sched/fair: Add energy_diff dead-zone margin It is not worth the overhead to migrate tasks for tiny insignificant energy savings. To prevent this, an energy margin is introduced in energy_diff() which effectively adds a dead-zone that rounds tiny energy differences to zero. Since no scale is enforced for energy model data the margin can't be absolute. Instead it is defined as +/-1.56% energy saving compared to the current total estimated energy consumption. 
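As a worked example (not part of the patch): the margin is computed as nrg.before >> 6, i.e. 1/64 or roughly 1.56% of the current total estimated energy. A minimal stand-alone sketch of the rounding, with nrg_deadzone() as a hypothetical helper name:

	/* Illustrative only: round energy deltas inside the dead-zone to zero. */
	static int nrg_deadzone(int nrg_before, int nrg_after)
	{
		int margin = nrg_before >> 6;	/* 1/64 =~ 1.56% */
		int diff = nrg_after - nrg_before;

		return (abs(diff) < margin) ? 0 : diff;
	}

E.g. nrg_before = 1024 gives margin = 16, so an estimated saving of up to 15 units is rounded to zero and the migration is skipped as not worthwhile.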
Change-Id: I6be069c752c701fb825430896b3b768a7ab2fee4 Signed-off-by: Morten Rasmussen [rebase: on top of msm-google/android-msm-marlin-3.18, massage original patch which changes code in energy_diff() into __energy_diff() introduced by SchedTune] Signed-off-by: Dietmar Eggemann Signed-off-by: Chris Redpath (cherry picked from commit 780cb5a5fa47adf13d4fc2b77e8e94448cd56098) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef16d4d3cc00..16f15549b5d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5149,6 +5149,7 @@ static inline int __energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int diff, margin; struct energy_env eenv_before = { .util_delta = 0, @@ -5198,6 +5199,16 @@ static inline int __energy_diff(struct energy_env *eenv) eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); + /* + * Dead-zone margin preventing too many migrations. + */ + + margin = eenv->nrg.before >> 6; /* ~1.56% */ + + diff = eenv->nrg.after - eenv->nrg.before; + + eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; + return eenv->nrg.diff; } From 3935105f5775e8f37dd06c73e5cc30e5585d673c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 30 Mar 2016 14:29:48 +0100 Subject: [PATCH 0840/3220] sched/fair: Energy-aware wake-up task placement When the system is not overutilized, place waking tasks on the most energy efficient cpu. Previous attempts reduced the search space by matching task utilization to cpu capacity before consulting the energy model as this is an expensive operation. The search heuristics didn't work very well and, lacking any better alternatives, this patch takes the brute-force route and tries all potential targets. This approach doesn't scale, but it might be sufficient for many embedded applications while work is continuing on a heuristic that can minimize the necessary computations. The heuristic must be derived from the platform energy model rather than make additional assumptions, such as lower capacity implying better energy efficiency. PeterZ mentioned in the past that we might be able to derive some simpler deciding functions using mathematical (modal?) analysis.
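The shape of the brute-force walk, condensed from the hunk below (illustrative only; the bookkeeping for the maximum-spare-capacity fallback is omitted): every allowed CPU in the sd_ea span is evaluated with energy_diff() and the largest estimated saving wins.

	for_each_cpu_and(i, tsk_cpus_allowed(p), sched_domain_span(sd)) {
		struct energy_env eenv = {
			.util_delta = task_util(p),
			.src_cpu    = prev_cpu,
			.dst_cpu    = i,
		};
		int diff;

		/* Skip candidates without enough spare capacity for the task. */
		if (capacity_spare_wake(i, p) * 1024 < capacity_margin * task_util(p))
			continue;

		diff = energy_diff(&eenv);	/* negative => migrating to i saves energy */
		if (diff < min_diff) {
			min_diff = diff;
			energy_cpu = i;
		}
	}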
Change-Id: I772bacb4c8fd599f8006fa422f842e66377a9c6c Signed-off-by: Morten Rasmussen [rebase: on top of msm-google/android-msm-marlin-3.18] Signed-off-by: Dietmar Eggemann (cherry picked from commit a894422dbdb7b77ea2acfe7ff909ccb5ded23514) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 16f15549b5d0..c4d9d8bf8d1f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5889,6 +5889,60 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return min_cap * 1024 < task_util(p) * capacity_margin; } +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) +{ + int i; + int min_diff = 0, energy_cpu = prev_cpu, spare_cpu = prev_cpu; + unsigned long max_spare = 0; + struct sched_domain *sd; + + rcu_read_lock(); + + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + + if (!sd) + goto unlock; + + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_domain_span(sd)) { + int diff; + unsigned long spare; + + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = prev_cpu, + .dst_cpu = i, + }; + + spare = capacity_spare_wake(i, p); + + if (i == prev_cpu) + continue; + + if (spare > max_spare) { + max_spare = spare; + spare_cpu = i; + } + + if (spare * 1024 < capacity_margin * task_util(p)) + continue; + + diff = energy_diff(&eenv); + + if (diff < min_diff) { + min_diff = diff; + energy_cpu = i; + } + } + +unlock: + rcu_read_unlock(); + + if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) + return prev_cpu; + + return energy_cpu != prev_cpu ? energy_cpu : spare_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5914,6 +5968,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) + return select_energy_cpu_brute(p, prev_cpu); + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) From f6f931489311aa208255192bf22d189d095f5b9a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 5 Dec 2016 14:15:54 +0000 Subject: [PATCH 0841/3220] Fixup!: sched/fair.c: Set SchedTune specific struct energy_env.task This has to be done in the callers of the SchedTune version of energy_diff() to avoid a NULL pointer dereference in energy_diff().
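For illustration (not part of the patch): the SchedTune variant of energy_diff() dereferences eenv->task, so every caller has to populate the field, e.g.:

	struct energy_env eenv = {
		.util_delta = task_util(p),
		.src_cpu    = prev_cpu,
		.dst_cpu    = target_cpu,
		.task       = p,	/* left unset, SchedTune's energy_diff() dereferences NULL */
	};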
Change-Id: I3f0f68dbd11efb15bbb3b1832f8294419ed85241 Signed-off-by: Dietmar Eggemann (cherry picked from commit 14531d4e245d063f713ee5ed835df958e6c7838f) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c4d9d8bf8d1f..ae3cc8df331c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5911,6 +5911,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) .util_delta = task_util(p), .src_cpu = prev_cpu, .dst_cpu = i, + .task = p, }; spare = capacity_spare_wake(i, p); From f6f931489311aa208255192bf22d189d095f5b9a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 5 Dec 2016 14:15:54 +0000 Subject: [PATCH 0842/3220] EAS: sched/fair: Re-integrate 'honor sync wakeups' into wakeup path This patch re-integrates the sync-wakeup handling which was initially added to energy_aware_wake_cpu() by 3b9d7554aeec ("EAS: sched/fair: tunable to honor sync wakeups"), this time into select_energy_cpu_brute(). Change-Id: I748fde3ecdeb44651179bce0a5bb8dd82d1903f6 Signed-off-by: Dietmar Eggemann (cherry picked from commit b75b7286cb068d5761621ea134c23dd131db953f) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae3cc8df331c..5e4bf1e76275 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5889,13 +5889,21 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return min_cap * 1024 < task_util(p) * capacity_margin; } -static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { int i; int min_diff = 0, energy_cpu = prev_cpu, spare_cpu = prev_cpu; unsigned long max_spare = 0; struct sched_domain *sd; + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } + rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); @@ -5970,7 +5978,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) - return select_energy_cpu_brute(p, prev_cpu); + return select_energy_cpu_brute(p, prev_cpu, sync); rcu_read_lock(); for_each_domain(cpu, tmp) { From 9e92e8a24fa5bfec70cc70cbfa81e2d34e887634 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 22 Mar 2017 18:16:03 +0000 Subject: [PATCH 0843/3220] sched/fair: Code !is_big_little path into select_energy_cpu_brute() This patch replaces the existing EAS upstream implementation of select_energy_cpu_brute() with the find_best_target() based one used in Android previously. It also removes the cpumask 'and' from select_energy_cpu_brute, see the existing use of 'cpu = smp_processor_id()' in select_task_rq_fair().
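The resulting control flow, condensed from the hunk below (illustrative only): a sync-wakeup fast path first, then find_best_target() to pick a candidate, then an energy_diff() check before accepting a migration away from prev_cpu.

	if (sysctl_sched_sync_hint_enable && sync) {
		int cpu = smp_processor_id();

		/* honor the sync hint: stay on the waker's CPU if allowed */
		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
			return cpu;
	}

	tmp_target = find_best_target(p, boosted, prefer_idle);
	/* ... and later veto the move if the model predicts no energy saving: */
	if (energy_diff(&eenv) >= 0)
		target_cpu = prev_cpu;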
Change-Id: If678c002efaa87d1ba3ec9989a4e9f8df98b83ec Signed-off-by: Dietmar Eggemann [ added guarding for non-schedtune builds ] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 66 ++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e4bf1e76275..387950f2649d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5891,65 +5891,57 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { - int i; - int min_diff = 0, energy_cpu = prev_cpu, spare_cpu = prev_cpu; - unsigned long max_spare = 0; struct sched_domain *sd; + int target_cpu = prev_cpu, tmp_target; + bool boosted, prefer_idle; if (sysctl_sched_sync_hint_enable && sync) { int cpu = smp_processor_id(); - cpumask_t search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); - if (cpumask_test_cpu(cpu, &search_cpus)) + + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return cpu; } rcu_read_lock(); +#ifdef CONFIG_CGROUP_SCHEDTUNE + boosted = schedtune_task_boost(p) > 0; + prefer_idle = schedtune_prefer_idle(p) > 0; +#else + boosted = get_sysctl_sched_cfs_boost() > 0; + prefer_idle = 0; +#endif sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + /* Find a cpu with sufficient capacity */ + tmp_target = find_best_target(p, boosted, prefer_idle); if (!sd) goto unlock; + if (tmp_target >= 0) { + target_cpu = tmp_target; + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) + goto unlock; + } - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_domain_span(sd)) { - int diff; - unsigned long spare; - + if (target_cpu != prev_cpu) { struct energy_env eenv = { - .util_delta = task_util(p), - .src_cpu = prev_cpu, - .dst_cpu = i, - .task = p, + .util_delta = task_util(p), + .src_cpu = prev_cpu, + .dst_cpu = target_cpu, + .task = p, }; - spare = capacity_spare_wake(i, p); + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(prev_cpu)) + goto unlock; - if (i == prev_cpu) - continue; - - if (spare > max_spare) { - max_spare = spare; - spare_cpu = i; - } - - if (spare * 1024 < capacity_margin * task_util(p)) - continue; - - diff = energy_diff(&eenv); - - if (diff < min_diff) { - min_diff = diff; - energy_cpu = i; - } + if (energy_diff(&eenv) >= 0) + target_cpu = prev_cpu; } unlock: rcu_read_unlock(); - - if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) - return prev_cpu; - - return energy_cpu != prev_cpu ? energy_cpu : spare_cpu; + return target_cpu; } /* From 242695407af782e37d53631a4d41bc213f35cf27 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 13 Jan 2017 13:51:24 +0000 Subject: [PATCH 0844/3220] sched: Remove sysctl_sched_is_big_little With the new wakeup approach this sysctl is not necessary any more. 
Change-Id: I52114b3c918791f6a4f9f30f50002919ccbc1a9c Signed-off-by: Dietmar Eggemann (cherry picked from commit 885c0d503bcdf0ef4e9b46822496f16b20aa3bbd) Signed-off-by: Chris Redpath --- include/linux/sched/sysctl.h | 1 - kernel/sched/fair.c | 1 - kernel/sysctl.c | 11 ++--------- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d68e88c9d4d7..2bf4520d8089 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -39,7 +39,6 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_is_big_little; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_initial_task_util; extern unsigned int sysctl_sched_cstate_aware; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 387950f2649d..770d0ab957cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -53,7 +53,6 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; -unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8b0542861c42..8ea768245b92 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -306,8 +306,8 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_granularity_ns, }, { - .procname = "sched_is_big_little", - .data = &sysctl_sched_is_big_little, + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, @@ -342,13 +342,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "sched_sync_hint_enable", - .data = &sysctl_sched_sync_hint_enable, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "sched_initial_task_util", .data = &sysctl_sched_initial_task_util, From 3e44a647c057c8ddc7052a221a8968bc55accfd1 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 8 Jan 2017 14:40:38 +0000 Subject: [PATCH 0845/3220] sched/core: Remove remnants of commit fd5c98da1a42 Commit fd5c98da1a42 "WIP: sched: Store system-wide maximum cpu capacity in root domain" was replaced by commit 8148bdfff4f5 "WIP: sched: Update max cpu capacity in case of max frequency constraints" which didn't remove all the now unused bits.
Change-Id: I067f6366431f43337cffa7a2a8e0de32dd33d2f9 Signed-off-by: Dietmar Eggemann (cherry picked from commit 6d284a607cec51bcafca313bc396bc3103b1e876) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 948ef2438606..9efcfb3d0fc1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7332,7 +7332,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; - struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7386,7 +7385,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } From 633b98b6519019e4960a82eb7de07e736d10cca1 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 8 Jan 2017 16:16:59 +0000 Subject: [PATCH 0846/3220] sched/core: Add first cpu w/ max/min orig capacity to root domain This will allow the wakeup path to start iterating from a cpu with max or min original capacity, regardless of which cpu the scheduler is currently running on (smp_processor_id()) or the previous cpu of the task (task_cpu(p)). This iteration has to happen on a sched_domain spanning all cpus in the order of the sched_groups of this sched_domain seen by the starting cpu. In case of an SMP system the first cpu with max orig capacity and the one with min orig capacity are the same. This can temporarily happen on a big.LITTLE system with hotplug as well. E.g. the different order of cpu iteration can be used to map schedtune task parameter 'boosted' into the cpu iteration order in find_best_target(). Use of READ_ONCE()/WRITE_ONCE() to avoid load/store tearing. Change-Id: I812fbd9c7e5f506617e456c0eec3edcd2c016e92 Signed-off-by: Dietmar Eggemann (cherry picked from commit fd6e9543c1fd8971a5e2e68e39b2f6e591d46114) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 15 +++++++++++++++ kernel/sched/sched.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9efcfb3d0fc1..495bc41907d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6155,6 +6155,9 @@ static int init_rootdomain(struct root_domain *rd) goto free_rto_mask; init_max_cpu_capacity(&rd->max_cpu_capacity); + + rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1; + return 0; free_rto_mask: @@ -7385,7 +7388,19 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu); + int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu); + + if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig > + cpu_rq(max_cpu)->cpu_capacity_orig)) + WRITE_ONCE(d.rd->max_cap_orig_cpu, i); + + if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig < + cpu_rq(min_cpu)->cpu_capacity_orig)) + WRITE_ONCE(d.rd->min_cap_orig_cpu, i); + sd = *per_cpu_ptr(d.sd, i); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6bc2dd623b17..2051fecdb9e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -561,6 +561,9 @@ struct root_domain { /* Maximum cpu capacity in the system.
*/ struct max_cpu_capacity max_cpu_capacity; + + /* First cpu with maximum and minimum original capacity */ + int max_cap_orig_cpu, min_cap_orig_cpu; }; extern struct root_domain def_root_domain; From d3f5e8c3e9aca6fe9f80a0a70a3f8ef6169ff852 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 13 Jan 2017 17:54:34 +0000 Subject: [PATCH 0847/3220] sched/fair: Change cpu iteration order in find_best_target() The schedtune task parameter 'boosted' is mapped into the cpu iteration order. Currently for 'boosted' equal true the iteration starts at the last cpu (NR_CPUS-1) whereas for 'boosted' equal false it starts at the first cpu (0). This only has the desired effect if the cpu topology ordering matches the underlying assumption. This e.g. is the case for the Qc snapdragon 821 with its [L0 L1 b0 b1] cpu topology layout (L=lower max freq, b=higher max freq). This results in cpus with higher maximum capacity being given the highest logical cpu ids. However not all big.LITTLE systems enumerate their cpus in the same way. For example, the ARM Versatile Express Juno board has 6 cpus for which the default configuration has topology [L0 b0 b1 L1 L2 L3]. To make this approach independent of the cpu topology layout it now iterates over the cpus in the order of the sched_groups of the EAS sched_domain (sd_ea). The order of cpu iteration is different for the different cpu types because the cpu itself is used to dereference sd_ea. Considering the Qc snapdragon 821 again, for cpu L0 and L1 the order is 'L0->L1->b0->b1' whereas for b0 and b1 the order is 'b0->b1->L0->L1'. This approach does not allow the exact same iteration order as with the currently used flat iteration over [0 .. NR_CPUS-1] but the cpus are ordered by the original cpu capacity. The cpu iteration is now done in the sd_ea sched_group order required by the 'boosted' value ['L0->L1->b0->b1'/'b0->b1->L0->L1'] rather than forward/backward over the flat cpu space ['L0->L1->b0->b1'/ 'b1->b0->L1->L0']. Change-Id: I8fbe2073dedd2ecb1c750620c6000c11a5ff4358 Signed-off-by: Dietmar Eggemann (cherry picked from commit a0c6a4272c3968c0ff50d3fed65f5865b72d777b) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 173 +++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 74 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 770d0ab957cc..d9c732de3295 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5743,100 +5743,125 @@ done: return target; } +static int start_cpu(bool boosted) +{ + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + + RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; +} + static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { - int iter_cpu; int target_cpu = -1; int target_util = 0; int backup_capacity = 0; int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; - unsigned long task_util_boosted, new_util; + unsigned long min_util; + unsigned long new_util; + struct sched_domain *sd; + struct sched_group *sg; + int cpu = start_cpu(boosted); - task_util_boosted = boosted_task_util(p); - for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { - int cur_capacity; - struct rq *rq; - int idle_idx; + if (cpu < 0) + return target_cpu; - /* - * Iterate from higher cpus for boosted tasks. - */ - int i = boosted ?
NR_CPUS-iter_cpu-1 : iter_cpu; + sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) - continue; + if (!sd) + return target_cpu; - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - new_util = cpu_util(i) + task_util_boosted; + sg = sd->groups; - /* - * Ensure minimum capacity to grant the required boost. - * The target CPU can be already at a capacity level higher - * than the one required to boost the task. - */ - if (new_util > capacity_orig_of(i)) - continue; + min_util = boosted_task_util(p); + + do { + int i; + + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + int cur_capacity; + struct rq *rq; + int idle_idx; + + if (!cpu_online(i)) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util(p); + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + new_util = max(min_util, new_util); + if (new_util > capacity_orig_of(i)) + continue; #ifdef CONFIG_SCHED_WALT - if (walt_cpu_high_irqload(i)) - continue; + if (walt_cpu_high_irqload(i)) + continue; #endif - /* - * Unconditionally favoring tasks that prefer idle cpus to - * improve latency. - */ - if (idle_cpu(i) && prefer_idle) { - if (best_idle_cpu < 0) - best_idle_cpu = i; - continue; - } - cur_capacity = capacity_curr_of(i); - rq = cpu_rq(i); - idle_idx = idle_get_state_idx(rq); - - if (new_util < cur_capacity) { - if (cpu_rq(i)->nr_running) { - if (prefer_idle) { - /* Find a target cpu with highest - * utilization. - */ - if (target_util == 0 || - target_util < new_util) { - target_cpu = i; - target_util = new_util; - } - } else { - /* Find a target cpu with lowest - * utilization. - */ - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; - } - } - } else if (!prefer_idle) { - if (best_idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; + /* + * Unconditionally favoring tasks that prefer idle cpus to + * improve latency. + */ + if (idle_cpu(i) && prefer_idle) { + if (best_idle_cpu < 0) best_idle_cpu = i; - } + continue; + } + + cur_capacity = capacity_curr_of(i); + rq = cpu_rq(i); + idle_idx = idle_get_state_idx(rq); + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + if (!prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ + if (target_util == 0 || + target_util < new_util) { + target_cpu = i; + target_util = new_util; + } + } else { + /* Find a target cpu with lowest + * utilization. + */ + if (target_util == 0 || + target_util > new_util) { + target_cpu = i; + target_util = new_util; + } + } + } else if (!prefer_idle) { + if (best_idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + best_idle_cpu = i; + } + } + } else if (backup_capacity == 0 || + backup_capacity > cur_capacity) { + // Find a backup cpu with least capacity. + backup_capacity = cur_capacity; + backup_cpu = i; } - } else if (backup_capacity == 0 || - backup_capacity > cur_capacity) { - // Find a backup cpu with least capacity. 
- backup_capacity = cur_capacity; - backup_cpu = i; } - } + } while (sg = sg->next, sg != sd->groups); if (prefer_idle && best_idle_cpu >= 0) target_cpu = best_idle_cpu; From b31ae71ef75ee5a33cb96e829a492f2f1c2ce810 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 7 Jan 2017 14:33:51 +0000 Subject: [PATCH 0848/3220] sched/fair: refactor find_best_target() for simplicity Simplify backup_capacity handling and use 'unsigned long' data type for cpu capacity, simplify target_util handling, simplify idle_idx handling & refactor min_util, new_util. Also return first idle cpu for prefer_idle task immediately. Change-Id: Ic89e140f7b369f3965703fdc8463013d16e9b94a Signed-off-by: Dietmar Eggemann Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 60 +++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9c732de3295..270091323ae8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5756,13 +5756,12 @@ static int start_cpu(bool boosted) static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { int target_cpu = -1; - int target_util = 0; - int backup_capacity = 0; + unsigned long target_util = prefer_idle ? ULONG_MAX : 0; + unsigned long backup_capacity = ULONG_MAX; int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; - unsigned long min_util; - unsigned long new_util; + unsigned long min_util = boosted_task_util(p); struct sched_domain *sd; struct sched_group *sg; int cpu = start_cpu(boosted); @@ -5777,15 +5776,11 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre sg = sd->groups; - min_util = boosted_task_util(p); - do { int i; for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - int cur_capacity; - struct rq *rq; - int idle_idx; + unsigned long cur_capacity, new_util; if (!cpu_online(i)) continue; @@ -5803,6 +5798,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * than the one required to boost the task. */ new_util = max(min_util, new_util); + if (new_util > capacity_orig_of(i)) continue; @@ -5815,38 +5811,25 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * Unconditionally favoring tasks that prefer idle cpus to * improve latency. */ - if (idle_cpu(i) && prefer_idle) { - if (best_idle_cpu < 0) - best_idle_cpu = i; - continue; - } + if (idle_cpu(i) && prefer_idle) + return i; cur_capacity = capacity_curr_of(i); - rq = cpu_rq(i); - idle_idx = idle_get_state_idx(rq); if (new_util < cur_capacity) { if (cpu_rq(i)->nr_running) { - if (!prefer_idle) { - /* Find a target cpu with highest - * utilization. - */ - if (target_util == 0 || - target_util < new_util) { - target_cpu = i; - target_util = new_util; - } - } else { - /* Find a target cpu with lowest - * utilization. - */ - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; - } + /* + * Find a target cpu with the lowest/highest + * utilization if prefer_idle/!prefer_idle. 
+ */ + if ((prefer_idle && target_util > new_util) || + (!prefer_idle && target_util < new_util)) { + target_util = new_util; + target_cpu = i; } } else if (!prefer_idle) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + if (best_idle_cpu < 0 || (sysctl_sched_cstate_aware && best_idle_cstate > idle_idx)) { @@ -5854,18 +5837,15 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre best_idle_cpu = i; } } - } else if (backup_capacity == 0 || - backup_capacity > cur_capacity) { - // Find a backup cpu with least capacity. + } else if (backup_capacity > cur_capacity) { + /* Find a backup cpu with least capacity. */ backup_capacity = cur_capacity; backup_cpu = i; } } } while (sg = sg->next, sg != sd->groups); - if (prefer_idle && best_idle_cpu >= 0) - target_cpu = best_idle_cpu; - else if (target_cpu < 0) + if (target_cpu < 0) target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; return target_cpu; From 4e18c8a10de0c4d435dce95e526ecbe97c77d5c5 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 16 Jan 2017 12:42:59 +0000 Subject: [PATCH 0849/3220] sched/fair: Simplify idle_idx handling in select_idle_sibling() Rename best_idle to best_idle_cpu so the same name is used like in find_best_target(). Fix if (best_idle > 0) since best_idle_cpu = 0 is a valid target. Use 'unsigned long' data type for best_idle_capacity. Since we're looking for the shallowest best_idle_cstate initialize best_idle_cstate = INT_MAX. For cpus which are not idle (idle_idx = -1) the condition 'if (idle_idx < best_idle_cstate && ...)' is never executed. Change-Id: Ic5b63d58478696b3d1ec6253cf739a69a574cf99 Signed-off-by: Dietmar Eggemann (cherry picked from commit 8bff5e9c0968108d465e1f2a4624fc5ec2f00849) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 270091323ae8..e77917b23c79 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5677,9 +5677,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int best_idle = -1; - int best_idle_cstate = -1; - int best_idle_capacity = INT_MAX; + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + unsigned long best_idle_capacity = ULONG_MAX; if (!sysctl_sched_cstate_aware) { if (idle_cpu(target)) @@ -5706,18 +5706,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (sysctl_sched_cstate_aware) { for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + int idle_idx = idle_get_state_idx(cpu_rq(i)); unsigned long new_usage = boosted_task_util(p); unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) goto next; if (i == target && new_usage <= capacity_curr_of(target)) return target; - if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { - best_idle = i; + if (idle_idx < best_idle_cstate && + capacity_orig <= best_idle_capacity) { + best_idle_cpu = i; best_idle_cstate = idle_idx; best_idle_capacity = capacity_orig; } @@ -5736,8 +5737,9 @@ next: sg = sg->next; } while (sg != sd->groups); } - if (best_idle > 0) - target = best_idle; + + if (best_idle_cpu >= 0) + target = best_idle_cpu; done: return target; From 9de438d27c43863dabf9db696fecbb90bc5c91eb Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Wed, 30 Mar 2016 04:30:56 
+0800 Subject: [PATCH 0850/3220] BACKPORT: sched/fair: Initiate a new task's util avg to a bounded value A new task's util_avg is set to full utilization of a CPU (100% time running). This accelerates a new task's utilization ramp-up, useful to boost its execution in early time. However, it may result in (insanely) high utilization for a transient time period when a flood of tasks are spawned. Importantly, it violates the "fundamentally bounded" CPU utilization, and its side effect is negative if we don't take any measure to bound it. This patch proposes an algorithm to address this issue. It has two methods to approach a sensible initial util_avg: (1) An expected (or average) util_avg based on its cfs_rq's util_avg: util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight (2) A trajectory of how successive new tasks' util develops, which gives 1/2 of the left utilization budget to a new task such that the additional util is noticeably large (when overall util is low) or unnoticeably small (when overall util is high enough). In the meantime, the aggregate utilization is well bounded: util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n where n denotes the nth task. If util_avg is larger than util_avg_cap, then the effective util is clamped to the util_avg_cap. Change-Id: Idafe989b24d9e70911666f09800bf1d5a011e1f4 Reported-by: Andrey Ryabinin Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Link: http://lkml.kernel.org/r/1459283456-21682-1-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 2b8c41daba327c633228169e8bd8ec067ab443f8) [integrate with schedfreq - schedfreq has a tuneable for init task util but this commit removes the use of the tuneable since we have a new algorithm for calculating an initial utilisation. I've left the tuneable in place, but it is no longer used even when schedfreq is the CPUFreq governor] Signed-off-by: Chris Redpath --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 73 +++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 1 + 3 files changed, 72 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 495bc41907d6..0daa14cd0d32 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2450,6 +2450,8 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Post initialize new task's util average when its cfs_rq is set */ + post_init_entity_util_avg(&p->se); rq = __task_rq_lock(p); walt_mark_task_starting(p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e77917b23c79..bb43c742a520 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -701,17 +701,81 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = sched_freq() ? - sysctl_sched_initial_task_util : - scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * In previous Android versions, we used to have: + * sa->util_avg = sched_freq() ? + * sysctl_sched_initial_task_util : + * scale_load_down(SCHED_LOAD_SCALE); + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + * However, that functionality has been moved to enqueue. 
+ * It is unclear if we should restore this in enqueue. + */ + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. + */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + /* + * If we wish to restore tuning via setting initial util, + * this is where we should do it. + */ + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } +} + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { } +void post_init_entity_util_avg(struct sched_entity *se) +{ +} #endif /* @@ -9409,6 +9473,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + post_init_entity_util_avg(se); } return 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2051fecdb9e5..50b9229d47ab 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1391,6 +1391,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); static inline void __add_nr_running(struct rq *rq, unsigned count) { From 3fd734a8f9b6703e8f0c6e35280e7907665ef6f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jun 2016 15:07:50 +0200 Subject: [PATCH 0851/3220] UPSTREAM: sched/fair: Fix post_init_entity_util_avg() serialization Chris Wilson reported a divide by 0 at: post_init_entity_util_avg(): > 725 if (cfs_rq->avg.util_avg != 0) { > 726 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; > -> 727 sa->util_avg /= (cfs_rq->avg.load_avg + 1); > 728 > 729 if (sa->util_avg > cap) > 730 sa->util_avg = cap; > 731 } else { Which given the lack of serialization, and the code generated from update_cfs_rq_load_avg() is entirely possible: if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = 
atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed_load = 1; } turns into: ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax ffffffff8108706b: 48 85 c0 test %rax,%rax ffffffff8108706e: 74 40 je ffffffff810870b0 ffffffff81087070: 4c 89 f8 mov %r15,%rax ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) ffffffff8108707e: 4c 89 f9 mov %r15,%rcx ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx Which you'll note ends up with 'sa->load_avg - r' in memory at ffffffff8108707a. By calling post_init_entity_util_avg() under rq->lock we're sure to be fully serialized against PELT updates and cannot observe intermediate state like this. Change-Id: I56c11886102b7859df82e26c88b1b7c200a39f6e Reported-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value") Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar (cherry picked from commit b7fa30c9cc48c4f55663420472505d3b4f6e1705) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 8 +++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0daa14cd0d32..00e969d7b2ad 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2450,10 +2450,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Post initialize new task's util average when its cfs_rq is set */ + rq = __task_rq_lock(p); post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); walt_mark_task_starting(p); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb43c742a520..04dea070fc6e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9444,8 +9444,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -9460,6 +9461,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -9473,7 +9476,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; From dc1386b6f78755f294f19075ddad6501570a9dee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2016 14:27:50 +0200 Subject: [PATCH 0852/3220] UPSTREAM: sched/fair: Apply more PELT fixes One additional 'rule' for using update_cfs_rq_load_avg() is that one should call 
update_tg_load_avg() if it returns true. Add a bunch of comments to hopefully clarify some of the rules: o You need to update cfs_rq _before_ any entity attach/detach, this is important, because while for mathematical consistency this isn't strictly needed, it is required for the physical interpretation of the model, you attach/detach _now_. o When you modify the cfs_rq avg, you have to then call update_tg_load_avg() in order to propagate changes upwards. o (Fair) entities are always attached, switched_{to,from}_fair() deal with !fair. This directly follows from the definition of the cfs_rq averages, namely that they are a direct sum of all (runnable or blocked) entities on that rq. It is the second rule that this patch enforces, but it adds comments pertaining to all of them. Change-Id: Icdc906e98c67b84cb9582c893bc761a9886be57a Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 3d30544f02120b884bba2a9466c87dba980e3be5) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 85 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04dea070fc6e..8459caa2e8df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -718,6 +718,11 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -747,7 +752,9 @@ void post_init_entity_util_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -765,6 +772,29 @@ void post_init_entity_util_avg(struct sched_entity *se) */ sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = now; + return; + } + } + + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); @@ -776,7 +806,10 @@ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics.
@@ -2823,9 +2856,25 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, - bool update_freq) +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; int decayed, removed = 0, removed_util = 0; @@ -2884,6 +2933,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) trace_sched_load_avg_task(task_of(se), &se->avg); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2913,6 +2970,14 @@ skip_aging: cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. 
+ */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -9322,6 +9387,8 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -9333,13 +9400,18 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -9350,7 +9422,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; From f9bef52c8505c29b24bff420dba15cea5ee581fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2016 10:56:25 +0200 Subject: [PATCH 0853/3220] UPSTREAM: sched/fair: Improve PELT stuff some more Vincent noted that the update_tg_load_avg() usage in commit: 3d30544f0212 ("sched/fair: Apply more PELT fixes") isn't entirely sufficient. We need to call this function every time cfs_rq->avg.load changes, this includes when update_cfs_rq_load_avg() returns true, but {attach,detach}_entity_load_avg() themselves also change it. This means we need to unconditionally call update_tg_load_avg(). Also, add more comments. Change-Id: I7e55fceb587601f73c760c8b0d47a7ef2b777b9e Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 7c3edd2c300b7ef2005a69dc727692ee07434aa5) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8459caa2e8df..3f2606842d02 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -754,7 +754,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -791,10 +790,9 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); @@ -2796,9 +2794,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). 
+/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2868,10 +2878,10 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed utilization. It is expected - * that one calls update_tg_load_avg() on this condition, but after you've - * modified the cfs_rq avg (attach/detach), such that we propagate the new - * avg up. + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -9388,7 +9398,6 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (!vruntime_normalized(p)) { /* @@ -9400,10 +9409,9 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -9411,7 +9419,6 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -9422,10 +9429,9 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; From 18d09a45eceb888e6cdc62b9e6beaa7bf5e15045 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:42 +0100 Subject: [PATCH 0854/3220] UPSTREAM: sched/fair: Factorize attach/detach entity Factorize post_init_entity_util_avg() and part of attach_task_cfs_rq() in one function attach_entity_cfs_rq(). Create symmetric detach_entity_cfs_rq() function. 
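For orientation, the two factored helpers pair the same three-step load-tracking sequence around the attach and detach operations. A sketch of the post-patch shape, matching the diff below (the CONFIG_FAIR_GROUP_SCHED depth fixup in the attach path is omitted here):

    static void detach_entity_cfs_rq(struct sched_entity *se)
    {
            struct cfs_rq *cfs_rq = cfs_rq_of(se);
            u64 now = cfs_rq_clock_task(cfs_rq);

            /* Catch up with the cfs_rq and remove our load when we leave */
            update_cfs_rq_load_avg(now, cfs_rq, false);
            detach_entity_load_avg(cfs_rq, se);
            update_tg_load_avg(cfs_rq, false);
    }

    static void attach_entity_cfs_rq(struct sched_entity *se)
    {
            struct cfs_rq *cfs_rq = cfs_rq_of(se);
            u64 now = cfs_rq_clock_task(cfs_rq);

            /* Symmetric: sync the cfs_rq, add our load, update the tg */
            update_cfs_rq_load_avg(now, cfs_rq, false);
            attach_entity_load_avg(cfs_rq, se);
            update_tg_load_avg(cfs_rq, false);
    }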
Change-Id: I44fc6bb5e71460be65f6b8928d4620c6c27a6a67 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit df217913e72ec7e603d8b68cc4c70646cf7000db) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 53 ++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3f2606842d02..58b84c8d7071 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -719,9 +719,7 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -753,7 +751,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; - u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -785,14 +782,12 @@ void post_init_entity_util_avg(struct sched_entity *se) * such that the next switched_to_fair() has the * expected state. */ - se->avg.last_update_time = now; + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); return; } } - update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); @@ -9393,30 +9388,19 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } -static void detach_task_cfs_rq(struct task_struct *p) +static void detach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } - /* Catch up with the cfs_rq and remove our load when we leave */ update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } -static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -9428,10 +9412,35 @@ static void attach_task_cfs_rq(struct task_struct *p) se->depth = se->parent ? 
se->parent->depth + 1 : 0; #endif - /* Synchronize task with its cfs_rq */ + /* Synchronize entity with its cfs_rq */ update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; From 723dab78719fea22340cc06992441d05a50b55e5 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:44 +0100 Subject: [PATCH 0855/3220] BACKPORT: sched/fair: Factorize PELT update Every time we modify load/utilization of sched_entity, we start to sync it with its cfs_rq. This update is done in different ways: - when attaching/detaching a sched_entity, we update cfs_rq and then we sync the entity with the cfs_rq. - when enqueueing/dequeuing the sched_entity, we update both sched_entity and cfs_rq metrics to now. Use update_load_avg() every time we have to update and sync cfs_rq and sched_entity before changing the state of a sched_entity. Change-Id: Ibde9a7e07ac80e9d5753bb4a0c30dfb3643cc666 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar [backported FROMLIST] Signed-off-by: Andres Oportus (cherry picked from commit d31b1a66cbe0931733583ad9d9e8c6cfd710907d) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 78 ++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 58b84c8d7071..411f7749e73d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2916,8 +2916,14 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return decayed || removed; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -2927,11 +2933,13 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); if
(entity_is_task(se)) @@ -2948,24 +2956,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; @@ -2985,9 +2975,6 @@ skip_aging: */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); @@ -3002,34 +2989,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3122,11 +3095,16 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { - cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); + return 0; } +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -3269,6 +3247,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3344,6 +3323,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -3434,7 +3414,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3550,7 +3530,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4479,7 +4459,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4581,7 +4561,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -9391,10 +9371,9 @@ static inline bool vruntime_normalized(struct task_struct *p) static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } @@ -9402,7 +9381,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -9413,7 +9391,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) #endif /* Synchronize entity with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } From 8370e07d82c92981e8cc9df180df6e943db179af Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:43 +0100 Subject: [PATCH 0856/3220] UPSTREAM: sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list Fix the insertion of cfs_rq in rq->leaf_cfs_rq_list to ensure that a child will always be called before its parent. The hierarchical order in shares update list has been introduced by commit: 67e86250f8ea ("sched: Introduce hierarchal order on shares update list") With the current implementation a child can still be put after its parent. Let's take the example of: root \ b /\ c d* | e* with root -> b -> c already enqueued but not d -> e so the leaf_cfs_rq_list looks like: head -> c -> b -> root -> tail The branch d -> e will be added the first time that they are enqueued, starting with e then d. When e is added, its parent is not already on the list, so e is put at the tail: head -> c -> b -> root -> e -> tail Then, d is added at the head because its parent is already on the list: head -> d -> c -> b -> root -> e -> tail e is not placed at the right position and will be called last, whereas it should be called at the beginning. Because it follows the bottom-up enqueue sequence, we are sure that we will finish by adding either a cfs_rq without a parent or a cfs_rq whose parent is already on the list. We can use this event to detect when we have finished adding a new branch.
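With the tmp_alone_branch cursor that this patch introduces (its mechanism is described in the next paragraph and in the diff), the example above then unfolds correctly. A worked trace, as a sketch:

    start:     head -> c -> b -> root -> tail          (tmp_alone_branch = head)
    enqueue e: parent d is not on the list yet, so e is inserted at
               tmp_alone_branch:
               head -> e -> c -> b -> root -> tail     (tmp_alone_branch = &e)
    enqueue d: parent b is on the list, so d is inserted just before b and
               the branch is complete:
               head -> e -> c -> d -> b -> root -> tail (tmp_alone_branch = head)

The traversal order e, c, d, b, root now visits every child before its parent.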
For the others, whose parents have not been added yet, we have to ensure that they will be added after their children, which were inserted in the steps just before, and after any potential parents that are already in the list. The easiest way is to put the cfs_rq just after the last inserted one and to keep track of it until the branch is fully added. Change-Id: I4fe0b8502ea628c13d14e8e5c5279bce67fb8845 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 9c2791f936ef5fd04a118b5c284f2c9a95f4a647) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 54 ++++++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 1 + 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00e969d7b2ad..defc88cda733 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7814,6 +7814,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get? * diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 411f7749e73d..15ccfbff1bde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -305,19 +305,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beginning of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If the parent is already on the list, we add the child + * just before. Thanks to the circular linked property of + * the list, this means putting the child at the tail + * of the list that starts at the parent. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * A cfs_rq without a parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reached the beginning of a tree so we can reset + * tmp_alone_branch to the beginning of the list.
+ */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not been added yet so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beginning of the branch + * where we will add the parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * Update tmp_alone_branch to point to the new beginning + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 50b9229d47ab..5660e8495ae2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -621,6 +621,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* From e875665411983d39589f0ba5416e0b99c6e0d2cb Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:45 +0100 Subject: [PATCH 0857/3220] UPSTREAM: sched/fair: Propagate load during synchronous attach/detach When a task moves from/to a cfs_rq, we set a flag which is then used to propagate the change at parent level (sched_entity and cfs_rq) during the next update. If the cfs_rq is throttled, the flag will stay pending until the cfs_rq is unthrottled. For propagating the utilization, we copy the utilization of group cfs_rq to the sched_entity. For propagating the load, we have to take into account the load of the whole task group in order to evaluate the load of the sched_entity. Similarly to what was done before the rewrite of PELT, we add a correction factor in case the task group's load is greater than its share so it will contribute the same load as a task of equal weight. Change-Id: Id34a9888484716961c9027299c0b4d82881a39d1 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 09a43ace1f986b003c118fdf6ddf1fd685692d49) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 240 ++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 240 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 15ccfbff1bde..a8f86f214b45 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2828,6 +2828,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -2849,14 +2869,196 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used.
+ */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then it's up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task being less decayed, but giving + * the wakee more load is not a bad thing. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load */ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equal to tg->shares will have + * a load less than or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares).
+ */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. + */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) @@ -2968,6 +3170,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); int cpu = cpu_of(rq_of(cfs_rq)); + int decayed; /* * Track task load average for carrying it to new CPU after migrated, and @@ -2979,7 +3182,10 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); + + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); if (entity_is_task(se)) @@ -3001,6 +3207,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3020,6 +3227,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -9408,6 +9616,31 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; 
+ + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -9416,6 +9649,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } static void attach_entity_cfs_rq(struct sched_entity *se) @@ -9434,6 +9668,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } static void detach_task_cfs_rq(struct task_struct *p) @@ -9512,6 +9747,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5660e8495ae2..d0971e5c9269 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -376,6 +376,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT From 89e4d18a6712b6452c577776abb8536913b6c1fb Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:46 +0100 Subject: [PATCH 0858/3220] UPSTREAM: sched/fair: Propagate asynchronous detach A task can be asynchronously detached from cfs_rq when migrating between CPUs. The load of the migrated task is then removed from the source cfs_rq during its next update. We use this event to set the propagation flag. During the load balance, we take advantage of the update of blocked load to propagate any pending changes. The propagation relies on patch: "sched: Fix hierarchical order in rq->leaf_cfs_rq_list" ... which orders children and parents, to ensure that it's done in one pass.
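The resulting one-pass propagation in update_blocked_averages() then looks as follows, as a sketch matching the diff below (the leaf list walk visits each child before its parent):

    for_each_leaf_cfs_rq(rq, cfs_rq) {
            /* Picks up asynchronously removed load and sets the flag */
            if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
                    update_tg_load_avg(cfs_rq, 0);

            /* Propagate pending load changes to the parent, visited later */
            if (cfs_rq->tg->se[cpu])
                    update_load_avg(cfs_rq->tg->se[cpu], 0);
    }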
Change-Id: I33782e35fc4711f5901e8c23d6aa7ec5f2ff7ee5 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 4e5160766fcc9f41bbd38bac11f92dce993644aa) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a8f86f214b45..e49408d0f590 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3131,6 +3131,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3138,6 +3139,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -7347,6 +7349,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } From 640c909c3470b8b8882676754485d5ee0ccccf7d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jun 2016 15:53:54 +0200 Subject: [PATCH 0859/3220] UPSTREAM: sched/fair: Fix effective_load() to consistently use smoothed load Starting with the following commit: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") calc_tg_weight() doesn't compute the right value as expected by effective_load(). The difference is in the 'correction' term. In order to ensure \Sum rw_j >= rw_i we cannot use tg->load_avg directly, since that might be lagging a correction on the current cfs_rq->avg.load_avg value. Therefore we use tg->load_avg - cfs_rq->tg_load_avg_contrib + cfs_rq->avg.load_avg. Now, per the referenced commit, calc_tg_weight() doesn't use cfs_rq->avg.load_avg, as is later used in @w, but uses cfs_rq->load.weight instead. So stop using calc_tg_weight() and do it explicitly. The effects of this bug are wake_affine() making randomly poor choices in cgroup-intense workloads. 
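Spelled out, the corrected term reads as follows; this is a sketch of the upstream computation described above, since the trimmed backport diff below only removes two now-unused forward declarations:

    /*
     * tg->load_avg may lag a correction for this cfs_rq, so subtract the
     * stale contribution and add back the current smoothed load, which
     * guarantees \Sum rw_j >= rw_i:
     */
    tg_weight  = atomic_long_read(&tg->load_avg);
    tg_weight -= cfs_rq->tg_load_avg_contrib;
    tg_weight += cfs_rq->avg.load_avg;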
Change-Id: I1c0058ff674650cf295c8dc3b88a5a3de4bddab0 Signed-off-by: Peter Zijlstra (Intel) Cc: # v4.3+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") Signed-off-by: Ingo Molnar (cherry picked from commit 7dd4912594daf769a46744848b05bd5bc6d62469) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e49408d0f590..df6fc28f80c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -830,8 +830,6 @@ void post_init_entity_util_avg(struct sched_entity *se) attach_entity_cfs_rq(se); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { From 20bbd92679ce1722c0b037b6eb49809a1c9cfd71 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 19 Oct 2016 14:45:23 +0200 Subject: [PATCH 0860/3220] UPSTREAM: sched/fair: Fix incorrect task group ->load_avg A scheduler performance regression has been reported by Joseph Salisbury, which he bisected back to: 3d30544f0212 ("sched/fair: Apply more PELT fixes") The regression triggers when several levels of task groups are involved (read: SystemD) and cpu_possible_mask != cpu_present_mask. The root cause is that group entity's load (tg_child->se[i]->avg.load_avg) is initialized to scale_load_down(se->load.weight). During the creation of a child task group, its group entities on possible CPUs are attached to parent's cfs_rq (tg_parent) and their loads are added to the parent's load (tg_parent->load_avg) with update_tg_load_avg(). But only the load on online CPUs will then be updated to reflect real load, whereas load on other CPUs will stay at the initial value. The result is a tg_parent->load_avg that is higher than the real load, the weight of group entities (tg_parent->se[i]->load.weight) on online CPUs is smaller than it should be, and the task group gets less running time than it could expect. ( This situation can be detected with /proc/sched_debug. The ".tg_load_avg" of the task group will be much higher than the sum of ".tg_load_avg_contrib" of online cfs_rqs of the task group. ) The load of group entities doesn't have to be initialized to anything other than 0 because their load will increase when an entity is attached. Change-Id: Ie55021ff98ba49016adfddb2444e9c9709939226 Reported-by: Joseph Salisbury Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Cc: # 4.8.x Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: joonwoop@codeaurora.org Fixes: 3d30544f0212 ("sched/fair: Apply more PELT fixes") Link: http://lkml.kernel.org/r/1476881123-10159-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit b5a9b340789b2b24c6896bcf7a065c31a4db671c) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df6fc28f80c7..f3fc9f4f353e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -739,7 +739,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue).
*/ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are initialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are initialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * In previous Android versions, we used to have: From baaa21b59be28c62746fd4f1465730bd42d1d5fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jun 2016 16:11:02 +0200 Subject: [PATCH 0861/3220] UPSTREAM: sched/fair: Fix calc_cfs_shares() fixed point arithmetics width confusion Commit: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") did something non-obvious, and also did it in a way that was buggy yet latent. The problem was exposed for real by a later commit in the v4.7 merge window: 2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels") ... after which tg->load_avg and cfs_rq->load.weight had different units (10 bit fixed point and 20 bit fixed point resp.). Add a comment to explain the use of cfs_rq->load.weight over the 'natural' cfs_rq->avg.load_avg and add scale_load_down() to correct for the difference in unit. Since this is (now, as per a previous commit) the only user of calc_tg_weight(), collapse it. The effects of this bug should be randomly inconsistent SMP-balancing of cgroups workloads. Change-Id: If1e565662ea163485edd94a12aef644d0e0dfe7a Reported-by: Jirka Hladky Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2159197d6677 ("sched/core: Enable increased load resolution on 64-bit kernels") Fixes: fde7d22e01aa ("sched/fair: Fix overly small weight for interactive group entities") Signed-off-by: Ingo Molnar (cherry picked from commit ea1dc6fc6242f991656e35e2ed3d90ec1cd13418) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3fc9f4f353e..911cb335e873 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2530,28 +2530,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) -{ - long tg_weight; - - /* - * Use this CPU's real-time load instead of the last load contribution - * as the updating of the contribution is delayed, and we will use the - * the real-time load to calc the share. See update_tg_load_avg(). - */ - tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->load.weight; - - return tg_weight; -} - static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long tg_weight, load, shares; - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + /* + * This really should be: cfs_rq->avg.load_avg, but instead we use + * cfs_rq->load.weight, which is its upper bound. This helps ramp up + * the shares for small weight interactive tasks.
+ */ + load = scale_load_down(cfs_rq->load.weight); + + tg_weight = atomic_long_read(&tg->load_avg); + + /* Ensure tg_weight >= load */ + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += load; shares = (tg->shares * load); if (tg_weight) @@ -2570,6 +2564,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return tg->shares; } # endif /* CONFIG_SMP */ + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { From e62a1ca36b90450f37d570fc98041a164a5778ed Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 21 Dec 2016 16:50:26 +0100 Subject: [PATCH 0862/3220] UPSTREAM: sched/core: Fix group_entity's share update The update of the share of a cfs_rq is done when its load_avg is updated but before the group_entity's load_avg has been updated for the past time slot. This generates wrong load_avg accounting which can be significant when small tasks are involved in the scheduling. Let's take the example of a task a that is dequeued from its task group A: root (cfs_rq) \ (se) A (cfs_rq) \ (se) a Task "a" was the only task in task group A, which becomes idle when a is dequeued. We have the sequence: - dequeue_entity a->se - update_load_avg(a->se) - dequeue_entity_load_avg(A->cfs_rq, a->se) - update_cfs_shares(A->cfs_rq) A->cfs_rq->load.weight == 0 A->se->load.weight is updated with the new share (0 in this case) - dequeue_entity A->se - update_load_avg(A->se) but its weight is now null so the last time slot (up to a tick) will be accounted with a weight of 0 instead of its real weight during the time slot. The last time slot will be accounted as an idle one whereas it was a running one. If the running time of task a is short enough that no tick happens when it runs, all running time of group entity A->se will be accounted as idle time. Instead, we should update the share of a cfs_rq (in fact the weight of its group entity) only after having updated the load_avg of the group_entity. update_cfs_shares() now takes the sched_entity as a parameter instead of the cfs_rq, and the weight of the group_entity is updated only once its load_avg has been synced with current time.
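The corrected dequeue ordering then becomes, as a sketch matching the diff below:

    update_curr(cfs_rq);
    update_load_avg(se, UPDATE_TG);      /* sync to now at the old weight */
    dequeue_entity_load_avg(cfs_rq, se);
    ...
    update_min_vruntime(cfs_rq);
    update_cfs_shares(se);               /* only now recompute the weight */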
Change-Id: Id6ce3be1767b44b444ce2a77ed1ba063e57c0664 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: pjt@google.com Link: http://lkml.kernel.org/r/1482335426-7664-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 89ee048f3cc796db6f26906c6bef4edf0bee70fd) [minor cherry pick stuff] Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 911cb335e873..d2479dc60e99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2583,16 +2583,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) return; + + if (throttled_hierarchy(cfs_rq)) + return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2601,8 +2605,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3499,8 +3504,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -3573,6 +3578,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. + * - For a group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); @@ -3609,7 +3623,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } /* @@ -3781,7 +3795,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated.
*/ update_load_avg(curr, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4710,7 +4724,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -4812,7 +4826,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -9920,8 +9934,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } From 55af3848151fb76013d427ea8dfff9f829f98ef6 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 30 Jul 2015 16:53:30 +0100 Subject: [PATCH 0863/3220] sched: EAS & 'single cpu per cluster'/cpu hotplug interoperability For Energy-Aware Scheduling (EAS) to work properly, even in the case that there is only one cpu per cluster or that cpus are hot-plugged out, the Energy Model (EM) data on all energy-aware sched domains (sd) has to be present for all online cpus. Mainline sd hierarchy setup code will remove sd's which are not useful for task scheduling, e.g. in the following situations: 1. Only 1 cpu is/remains in one cluster of a multi cluster system. This remaining cpu only has DIE and no MC sd. 2. A complete cluster in a two cluster system is hot-plugged out. The cpus of the remaining cluster only have MC and no DIE sd. To make sure that all online cpus keep all their energy-aware sd's, the sd degenerate functionality has been changed to not free a sd if its first sched group (sg) contains EM data, in the cases where it would otherwise be freed: 1. There is only 1 cpu left in the sd. 2. Certain sd flags are set which require at least 2 sg's, but only 1 is present. Instead of freeing such a sd it now clears only its SD_LOAD_BALANCE flag. This will make sure that the EAS functionality will always see all energy-aware sd's for all online cpus. It will introduce a tiny performance degradation for operations on affected cpus since the hot-path macro for_each_domain() has to deal with sd's not contributing to task scheduling at all now. In most cases the existing code makes sure that task scheduling is not invoked on a sd with !SD_LOAD_BALANCE. However, a small change is necessary in update_sd_lb_stats() to make sure that sd->parent is only initialized to !NULL in case the parent sd contains more than 1 sg. The handling of newidle decay values before the SD_LOAD_BALANCE check in rebalance_domains() stays unchanged.
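The core of the change in sd_degenerate() looks as follows, as a sketch matching the diff below (the remaining flag checks are elided):

    static int sd_degenerate(struct sched_domain *sd)
    {
            if (cpumask_weight(sched_domain_span(sd)) == 1) {
                    if (sd->groups->sge)            /* first sg carries EM data */
                            sd->flags &= ~SD_LOAD_BALANCE;
                    else
                            return 1;
            }
            ...
    }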
Test (w/ CONFIG_SCHED_DEBUG): JUNO r0 default system: $ cat /proc/cpuinfo | grep "^CPU part" CPU part : 0xd03 CPU part : 0xd07 CPU part : 0xd07 CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 SD names and flags: $ cat /proc/sys/kernel/sched_domain/cpu*/domain*/name MC DIE MC DIE MC DIE MC DIE MC DIE MC DIE $ printf "%x\n" `cat /proc/sys/kernel/sched_domain/cpu*/domain*/flags` 832f 102f 832f 102f 832f 102f 832f 102f 832f 102f 832f 102f Test 1: Hotplug-out one A57 (CPU part 0xd07) cpu: $ echo 0 > /sys/devices/system/cpu/cpu1/online $ cat /proc/cpuinfo | grep "^CPU part" CPU part : 0xd03 CPU part : 0xd07 CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 SD names and flags for remaining A57 (cpu2) cpu: $ cat /proc/sys/kernel/sched_domain/cpu2/domain*/name MC DIE $ printf "%x\n" `cat /proc/sys/kernel/sched_domain/cpu2/domain*/flags` 832e <-- MC SD with !SD_LOAD_BALANCE 102f Test 2: Hotplug-out the entire A57 cluster: $ echo 0 > /sys/devices/system/cpu/cpu1/online $ echo 0 > /sys/devices/system/cpu/cpu2/online $ cat /proc/cpuinfo | grep "^CPU part" CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 CPU part : 0xd03 SD names and flags for the remaining A53 (CPU part 0xd03) cluster: $ cat /proc/sys/kernel/sched_domain/cpu*/domain*/name MC DIE MC DIE MC DIE MC DIE $ printf "%x\n" `cat /proc/sys/kernel/sched_domain/cpu*/domain*/flags` 832f 102e <-- DIE SD with !SD_LOAD_BALANCE 832f 102e 832f 102e 832f 102e Change-Id: If24aa2b2628f334abbf0207d39e2a86168d9d673 Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 17 ++++++++++------- kernel/sched/fair.c | 7 +++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index defc88cda733..da44d13f2439 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5934,9 +5934,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); return -1; } @@ -6029,8 +6026,12 @@ static inline bool sched_debug(void) static int sd_degenerate(struct sched_domain *sd) { - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; + if (cpumask_weight(sched_domain_span(sd)) == 1) { + if (sd->groups->sge) + sd->flags &= ~SD_LOAD_BALANCE; + else + return 1; + } /* Following flags need at least 2 groups */ if (sd->flags & (SD_LOAD_BALANCE | @@ -6076,6 +6077,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN | SD_SHARE_CAP_STATES); + if (parent->groups->sge) { + parent->flags &= ~SD_LOAD_BALANCE; + return 0; + } if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -7353,8 +7358,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d2479dc60e99..c84e15e3cd56 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7971,6 +7971,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ +#define lb_sd_parent(sd) \ + (sd->parent && sd->parent->groups != sd->parent->groups->next) + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. 
@@ -8053,7 +8056,7 @@ next_group: env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; - if (!env->sd->parent) { + if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; @@ -8561,7 +8564,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; struct sched_group *group; struct rq *busiest; unsigned long flags; From aa8882923a3f89aa880b6eff6fccc70e77b4a7c2 Mon Sep 17 00:00:00 2001 From: Andres Oportus Date: Thu, 23 Feb 2017 11:58:22 -0800 Subject: [PATCH 0864/3220] sched/core: Fix PELT jump to max OPP upon util increase Change-Id: Ic80b588ec466ef707f658dcea039fd0d6b384b63 Signed-off-by: Andres Oportus --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da44d13f2439..e353de860bfd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2969,9 +2969,10 @@ unsigned long sum_capacity_reqs(unsigned long cfs_cap, return total += scr->dl; } +unsigned long boosted_cpu_util(int cpu); static void sched_freq_tick_pelt(int cpu) { - unsigned long cpu_utilization = capacity_max; + unsigned long cpu_utilization = boosted_cpu_util(cpu); unsigned long capacity_curr = capacity_curr_of(cpu); struct sched_capacity_reqs *scr; From 4b85765a3dd9e1241e2cfbb8bd600f88411cfa0a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 22 Mar 2017 18:23:13 +0000 Subject: [PATCH 0865/3220] sched/fair: Add eas (& cas) specific rq, sd and task stats The statistics counters are placed in the eas (& cas) wakeup path. Each of them has one representation for the runqueue (rq), the sched_domain (sd) and the task. A task counter is always incremented. A rq counter is always incremented for the rq the scheduler is currently running on. A sd counter is only incremented if a relation to a sd exists. The counters are exposed: (1) In /proc/schedstat for rq's and sd's: $ cat /proc/schedstat ... cpu0 71422 0 2321254 ... eas 44144 0 0 19446 0 24698 568435 51621 156932 133 222011 17459 120279 516814 83 0 156962 359235 176439 139981 <- runqueue for cpu0 ... domain0 3 42430 42331 ... eas 0 0 0 14200 0 0 0 0 0 0 0 0 0 0 0 0 0 0 66355 0 <- MC sched domain for cpu0 ... The per-cpu eas vector has the following elements: sis_attempts sis_idle sis_cache_affine sis_suff_cap sis_idle_cpu sis_count || secb_attempts secb_sync secb_idle_bt secb_insuff_cap secb_no_nrg_sav secb_nrg_sav secb_count || fbt_attempts fbt_no_cpu fbt_no_sd fbt_pref_idle fbt_count || cas_attempts cas_count The following relations exist between these counters (from cpu0 eas vector above): sis_attempts = sis_idle + sis_cache_affine + sis_suff_cap + sis_idle_cpu + sis_count 44144 = 0 + 0 + 19446 + 0 + 24698 secb_attempts = secb_sync + secb_idle_bt + secb_insuff_cap + secb_no_nrg_sav + secb_nrg_sav + secb_count 568435 = 51621 + 156932 + 133 + 222011 + 17459 + 120279 fbt_attempts = fbt_no_cpu + fbt_no_sd + fbt_pref_idle + fbt_count + (return -1) 516814 = 83 + 0 + 156962 + 359235 + (534) cas_attempts = cas_count + (return -1 or smp_processor_id()) 176439 = 139981 + (36458) (2) In /proc/$PROCESS_PID/task/$TASK_PID/sched for a task. example: main thread of system_server $ cat /proc/1083/task/1083/sched ...
se.statistics.nr_wakeups_sis_attempts : 945 se.statistics.nr_wakeups_sis_idle : 0 se.statistics.nr_wakeups_sis_cache_affine : 0 se.statistics.nr_wakeups_sis_suff_cap : 219 se.statistics.nr_wakeups_sis_idle_cpu : 0 se.statistics.nr_wakeups_sis_count : 726 se.statistics.nr_wakeups_secb_attempts : 10376 se.statistics.nr_wakeups_secb_sync : 1462 se.statistics.nr_wakeups_secb_idle_bt : 6984 se.statistics.nr_wakeups_secb_insuff_cap : 3 se.statistics.nr_wakeups_secb_no_nrg_sav : 927 se.statistics.nr_wakeups_secb_nrg_sav : 206 se.statistics.nr_wakeups_secb_count : 794 se.statistics.nr_wakeups_fbt_attempts : 8914 se.statistics.nr_wakeups_fbt_no_cpu : 0 se.statistics.nr_wakeups_fbt_no_sd : 0 se.statistics.nr_wakeups_fbt_pref_idle : 6987 se.statistics.nr_wakeups_fbt_count : 1554 se.statistics.nr_wakeups_cas_attempts : 3107 se.statistics.nr_wakeups_cas_count : 1195 ... The same relation between the counters as in the per-cpu case apply. Change-Id: Ie7d01267c78a3f41f60a3ef52917d5a5d463f195 Signed-off-by: Dietmar Eggemann Signed-off-by: Chris Redpath --- include/linux/sched.h | 62 +++++++++++++++++ kernel/sched/debug.c | 26 ++++++++ kernel/sched/fair.c | 152 ++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 2 + kernel/sched/stats.c | 23 +++++++ 5 files changed, 228 insertions(+), 37 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 436f36f768c6..ad2c304b29b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1065,6 +1065,37 @@ unsigned long capacity_curr_of(int cpu); struct sched_group; +struct eas_stats { + /* select_idle_sibling() stats */ + u64 sis_attempts; + u64 sis_idle; + u64 sis_cache_affine; + u64 sis_suff_cap; + u64 sis_idle_cpu; + u64 sis_count; + + /* select_energy_cpu_brute() stats */ + u64 secb_attempts; + u64 secb_sync; + u64 secb_idle_bt; + u64 secb_insuff_cap; + u64 secb_no_nrg_sav; + u64 secb_nrg_sav; + u64 secb_count; + + /* find_best_target() stats */ + u64 fbt_attempts; + u64 fbt_no_cpu; + u64 fbt_no_sd; + u64 fbt_pref_idle; + u64 fbt_count; + + /* cas */ + /* select_task_rq_fair() stats */ + u64 cas_attempts; + u64 cas_count; +}; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -1125,6 +1156,8 @@ struct sched_domain { unsigned int ttwu_wake_remote; unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; + + struct eas_stats eas_stats; #endif #ifdef CONFIG_SCHED_DEBUG char *name; @@ -1283,6 +1316,35 @@ struct sched_statistics { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; + + /* select_idle_sibling() */ + u64 nr_wakeups_sis_attempts; + u64 nr_wakeups_sis_idle; + u64 nr_wakeups_sis_cache_affine; + u64 nr_wakeups_sis_suff_cap; + u64 nr_wakeups_sis_idle_cpu; + u64 nr_wakeups_sis_count; + + /* energy_aware_wake_cpu() */ + u64 nr_wakeups_secb_attempts; + u64 nr_wakeups_secb_sync; + u64 nr_wakeups_secb_idle_bt; + u64 nr_wakeups_secb_insuff_cap; + u64 nr_wakeups_secb_no_nrg_sav; + u64 nr_wakeups_secb_nrg_sav; + u64 nr_wakeups_secb_count; + + /* find_best_target() */ + u64 nr_wakeups_fbt_attempts; + u64 nr_wakeups_fbt_no_cpu; + u64 nr_wakeups_fbt_no_sd; + u64 nr_wakeups_fbt_pref_idle; + u64 nr_wakeups_fbt_count; + + /* cas */ + /* select_task_rq_fair() */ + u64 nr_wakeups_cas_attempts; + u64 nr_wakeups_cas_count; }; #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..7f7116622631 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -597,6 +597,32 @@ void 
proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.nr_wakeups_affine_attempts); P(se.statistics.nr_wakeups_passive); P(se.statistics.nr_wakeups_idle); + /* eas */ + /* select_idle_sibling() */ + P(se.statistics.nr_wakeups_sis_attempts); + P(se.statistics.nr_wakeups_sis_idle); + P(se.statistics.nr_wakeups_sis_cache_affine); + P(se.statistics.nr_wakeups_sis_suff_cap); + P(se.statistics.nr_wakeups_sis_idle_cpu); + P(se.statistics.nr_wakeups_sis_count); + /* select_energy_cpu_brute() */ + P(se.statistics.nr_wakeups_secb_attempts); + P(se.statistics.nr_wakeups_secb_sync); + P(se.statistics.nr_wakeups_secb_idle_bt); + P(se.statistics.nr_wakeups_secb_insuff_cap); + P(se.statistics.nr_wakeups_secb_no_nrg_sav); + P(se.statistics.nr_wakeups_secb_nrg_sav); + P(se.statistics.nr_wakeups_secb_count); + /* find_best_target() */ + P(se.statistics.nr_wakeups_fbt_attempts); + P(se.statistics.nr_wakeups_fbt_no_cpu); + P(se.statistics.nr_wakeups_fbt_no_sd); + P(se.statistics.nr_wakeups_fbt_pref_idle); + P(se.statistics.nr_wakeups_fbt_count); + /* cas */ + /* select_task_rq_fair() */ + P(se.statistics.nr_wakeups_cas_attempts); + P(se.statistics.nr_wakeups_cas_count); { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c84e15e3cd56..dfd150b02e63 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6059,15 +6059,24 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) int best_idle_cstate = INT_MAX; unsigned long best_idle_capacity = ULONG_MAX; + schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts); + schedstat_inc(this_rq(), eas_stats.sis_attempts); + if (!sysctl_sched_cstate_aware) { - if (idle_cpu(target)) + if (idle_cpu(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle); + schedstat_inc(this_rq(), eas_stats.sis_idle); return target; + } /* * If the prevous cpu is cache affine and idle, don't be stupid. 
*/ - if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine); + schedstat_inc(this_rq(), eas_stats.sis_cache_affine); return prev; + } } /* @@ -6091,8 +6100,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (new_usage > capacity_orig || !idle_cpu(i)) goto next; - if (i == target && new_usage <= capacity_curr_of(target)) + if (i == target && new_usage <= capacity_curr_of(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap); + schedstat_inc(this_rq(), eas_stats.sis_suff_cap); + schedstat_inc(sd, eas_stats.sis_suff_cap); return target; + } if (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity) { @@ -6109,6 +6122,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu); + schedstat_inc(this_rq(), eas_stats.sis_idle_cpu); + schedstat_inc(sd, eas_stats.sis_idle_cpu); goto done; } next: @@ -6120,6 +6136,9 @@ next: target = best_idle_cpu; done: + schedstat_inc(p, se.statistics.nr_wakeups_sis_count); + schedstat_inc(this_rq(), eas_stats.sis_count); + return target; } @@ -6146,13 +6165,22 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre struct sched_group *sg; int cpu = start_cpu(boosted); - if (cpu < 0) + schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); + schedstat_inc(this_rq(), eas_stats.fbt_attempts); + + if (cpu < 0) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); + schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); return target_cpu; + } sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!sd) + if (!sd) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd); + schedstat_inc(this_rq(), eas_stats.fbt_no_sd); return target_cpu; + } sg = sd->groups; @@ -6191,8 +6219,11 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * Unconditionally favoring tasks that prefer idle cpus to * improve latency. */ - if (idle_cpu(i) && prefer_idle) + if (idle_cpu(i) && prefer_idle) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); return i; + } cur_capacity = capacity_curr_of(i); @@ -6228,6 +6259,11 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (target_cpu < 0) target_cpu = best_idle_cpu >= 0 ? 
best_idle_cpu : backup_cpu; + if (target_cpu >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq(), eas_stats.fbt_count); + } + return target_cpu; } @@ -6279,11 +6315,17 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync int target_cpu = prev_cpu, tmp_target; bool boosted, prefer_idle; + schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); + schedstat_inc(this_rq(), eas_stats.secb_attempts); + if (sysctl_sched_sync_hint_enable && sync) { int cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_sync); + schedstat_inc(this_rq(), eas_stats.secb_sync); return cpu; + } } rcu_read_lock(); @@ -6303,8 +6345,11 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync goto unlock; if (tmp_target >= 0) { target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq(), eas_stats.secb_idle_bt); goto unlock; + } } if (target_cpu != prev_cpu) { @@ -6316,15 +6361,30 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync }; /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(prev_cpu)) + if (cpu_overutilized(prev_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); + schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); goto unlock; + } - if (energy_diff(&eenv) >= 0) + if (energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); target_cpu = prev_cpu; + goto unlock; + } + + schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + goto unlock; } + schedstat_inc(p, se.statistics.nr_wakeups_secb_count); + schedstat_inc(this_rq(), eas_stats.secb_count); + unlock: rcu_read_unlock(); + return target_cpu; } @@ -6387,39 +6447,57 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } else while (sd) { - struct sched_group *group; - int weight; + } else { + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); } - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; + while (sd) { + struct sched_group *group; + int weight; + + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ } - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ } rcu_read_unlock(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d0971e5c9269..6880fbc39760 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -734,6 +734,8 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; + + struct eas_stats eas_stats; #endif #ifdef CONFIG_SMP diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 87e2c9f0c33e..b63879918cd6 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -12,6 +12,26 @@ */ #define SCHEDSTAT_VERSION 15 +static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) +{ + /* eas-specific runqueue stats */ + seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ", + stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine, + stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count); + + seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ", + stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt, + stats->secb_insuff_cap, stats->secb_no_nrg_sav, + stats->secb_nrg_sav, stats->secb_count); + + seq_printf(seq, "%llu %llu %llu %llu %llu ", + stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd, + stats->fbt_pref_idle, stats->fbt_count); + + seq_printf(seq, "%llu %llu\n", + stats->cas_attempts, stats->cas_count); +} + static int show_schedstat(struct seq_file *seq, void *v) { int cpu; @@ -39,6 +59,7 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); + show_easstat(seq, &rq->eas_stats); #ifdef CONFIG_SMP /* domain-specific stats */ rcu_read_lock(); @@ -66,6 +87,8 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, 
sd->ttwu_move_affine, sd->ttwu_move_balance); + + show_easstat(seq, &sd->eas_stats); } rcu_read_unlock(); #endif From 8ac52cbaf41b01baa48fbbe65879734168037f15 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 8 Feb 2017 14:25:35 +0000 Subject: [PATCH 0866/3220] trace:sched: Make util_avg in load_avg trace reflect PELT/WALT as used With the ability to choose between WALT and PELT for utilisation tracking we can have the situation where we're using WALT to make all the decisions and reporting PELT figures in the sched_load_avg_(cpu|task) trace points. This is not too much of an issue, but when analysing trace it is nice to see numbers representing what the scheduler is using rather than needing to add in additional sched_walt_* traces to figure it out. Add reporting for both types, and make the util_avg member reflect what will be seen from cpu or task_util functions in the scheduler. Change-Id: I2abbd2c5fa70822096d0f3372b4c12b1c6af1590 Signed-off-by: Chris Redpath --- include/trace/events/sched.h | 44 +++++++++++++++++++++++++++++++----- kernel/sched/fair.c | 9 ++++++-- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 1d758d18a1b7..433d3919ba00 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -638,14 +638,21 @@ TRACE_EVENT(sched_contrib_scale_f, #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int walt_ravg_window; +extern unsigned int walt_disabled; +#endif + /* * Tracepoint for accounting sched averages for tasks. */ TRACE_EVENT(sched_load_avg_task, - TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg, void *_ravg), - TP_ARGS(tsk, avg), + TP_ARGS(tsk, avg, _ravg), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -653,6 +660,8 @@ TRACE_EVENT(sched_load_avg_task, __field( int, cpu ) __field( unsigned long, load_avg ) __field( unsigned long, util_avg ) + __field( unsigned long, util_avg_pelt ) + __field( unsigned long, util_avg_walt ) __field( u64, load_sum ) __field( u32, util_sum ) __field( u32, period_contrib ) @@ -667,15 +676,25 @@ TRACE_EVENT(sched_load_avg_task, __entry->load_sum = avg->load_sum; __entry->util_sum = avg->util_sum; __entry->period_contrib = avg->period_contrib; + __entry->util_avg_pelt = avg->util_avg; + __entry->util_avg_walt = 0; +#ifdef CONFIG_SCHED_WALT + __entry->util_avg_walt = (((unsigned long)((struct ravg*)_ravg)->demand) << SCHED_LOAD_SHIFT); + do_div(__entry->util_avg_walt, walt_ravg_window); + if (!walt_disabled && sysctl_sched_use_walt_task_util) + __entry->util_avg = __entry->util_avg_walt; +#endif ), - - TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu" + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu " + "util_avg_pelt=%lu util_avg_walt=%lu load_sum=%llu" " util_sum=%u period_contrib=%u", __entry->comm, __entry->pid, __entry->cpu, __entry->load_avg, __entry->util_avg, + __entry->util_avg_pelt, + __entry->util_avg_walt, (u64)__entry->load_sum, (u32)__entry->util_sum, (u32)__entry->period_contrib) @@ -694,16 +713,29 @@ TRACE_EVENT(sched_load_avg_cpu, __field( int, cpu ) __field( unsigned long, load_avg ) __field( unsigned long, util_avg ) + __field( unsigned long, util_avg_pelt ) + __field( unsigned long, util_avg_walt ) ), TP_fast_assign( __entry->cpu = cpu; __entry->load_avg = cfs_rq->avg.load_avg; 
__entry->util_avg = cfs_rq->avg.util_avg; + __entry->util_avg_pelt = cfs_rq->avg.util_avg; + __entry->util_avg_walt = 0; +#ifdef CONFIG_SCHED_WALT + __entry->util_avg_walt = + cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(__entry->util_avg_walt, walt_ravg_window); + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + __entry->util_avg = __entry->util_avg_walt; +#endif ), - TP_printk("cpu=%d load_avg=%lu util_avg=%lu", - __entry->cpu, __entry->load_avg, __entry->util_avg) + TP_printk("cpu=%d load_avg=%lu util_avg=%lu " + "util_avg_pelt=%lu util_avg_walt=%lu", + __entry->cpu, __entry->load_avg, __entry->util_avg, + __entry->util_avg_pelt, __entry->util_avg_walt) ); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfd150b02e63..f972df2115d7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3178,6 +3178,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) u64 now = cfs_rq_clock_task(cfs_rq); int cpu = cpu_of(rq_of(cfs_rq)); int decayed; + void *ptr = NULL; /* * Track task load average for carrying it to new CPU after migrated, and @@ -3195,8 +3196,12 @@ static inline void update_load_avg(struct sched_entity *se, int flags) if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); - if (entity_is_task(se)) - trace_sched_load_avg_task(task_of(se), &se->avg); + if (entity_is_task(se)) { +#ifdef CONFIG_SCHED_WALT + ptr = (void *)&(task_of(se)->ravg); +#endif + trace_sched_load_avg_task(task_of(se), &se->avg, ptr); + } } /** From 8865f07600fad56a7fc6b96dc339bbe9a16eb7eb Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 28 Feb 2017 17:27:28 +0000 Subject: [PATCH 0867/3220] sched/fair: remove task util from own cpu when placing waking task When we place a waking task with find_best_target, we calculate the existing and new utilisation of each candidate cpu. However, we do not remove any blocked load resulting from the waking task on the previous cpu which might cause unnecessary migrations. Switch to using cpu_util_wake which does this for us, which requires moving cpu_util_wake a few functions earlier. Also, we have multiple potential cpu utilization signals here, so update the necessary bits to allow WALT to work properly (including not subtracting task util for WALT). When WALT is in use, cpu utilization is the utilization in the previous completed window, whilst the task utilization ignores fully idle windows. There seems to be no way to have a decently accurate estimate of how much (if any) utilization from this task remains on the prev cpu. Instead, just return cpu_util when we're using WALT. Change-Id: I448203ab98ffb5c020dfb6b218581eef1f5601f7 Reported-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 48 +++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f972df2115d7..27a6f5d6c86c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6147,6 +6147,34 @@ done: return target; } +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. 
+ */ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_util(cpu) - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + static int start_cpu(bool boosted) { struct root_domain *rd = cpu_rq(smp_processor_id())->rd; @@ -6203,7 +6231,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - new_util = cpu_util(i) + task_util(p); + new_util = cpu_util_wake(i, p) + task_util(p); /* * Ensure minimum capacity to grant the required boost. @@ -6272,24 +6300,6 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre return target_cpu; } -/* - * cpu_util_wake: Compute cpu utilization with any contributions from - * the waking task p removed. - */ -static int cpu_util_wake(int cpu, struct task_struct *p) -{ - unsigned long util, capacity; - - /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !p->se.avg.last_update_time) - return cpu_util(cpu); - - capacity = capacity_orig_of(cpu); - util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); - - return (util >= capacity) ? capacity : util; -} - /* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. From 83f462daa328f2f42c3c1f7f5277f71e3fa0f750 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 8 Mar 2017 13:37:32 +0000 Subject: [PATCH 0868/3220] sched/fair: ensure utilization signals are synchronized before use wake_cap performs task and cpu utilization synchronization which is what allows us to subtract current task util from prev_cpu util and have a sensible number to work with. It looks as though if wake_wide returns 0, we could potentially not execute wake_cap, which would result in unsynced signals we then use for energy calculations. This is not necessarily an issue we've seen in traces, but it looks as though it should be changed. 
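Concretely, the && chain short-circuits: when wake_wide() returns true, wake_cap() is never evaluated, and the task/cpu utilization sync it performs never happens. Hoisting the call, as the hunk below does, makes that side effect unconditional on the SD_BALANCE_WAKE path. A schematic before/after (same signatures as this tree):

	/* before: wake_cap() is skipped whenever wake_wide(p) is true */
	want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
		      cpumask_test_cpu(cpu, tsk_cpus_allowed(p));

	/* after: the sync inside wake_cap() always runs on this path */
	int _wake_cap = wake_cap(p, cpu, prev_cpu);
	want_affine = !wake_wide(p) && !_wake_cap &&
		      cpumask_test_cpu(cpu, tsk_cpus_allowed(p));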
Change-Id: Ic54a3cba2a10d946ea20113a04371dea04115e82 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 27a6f5d6c86c..23e2b5f33ff6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6424,9 +6424,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + if (sd_flag & SD_BALANCE_WAKE) { + /* + * do wake_cap unconditionally as it causes task and cpu + * utilization to be synced, and we need that for energy + * aware wakeups + */ + int _wake_cap = wake_cap(p, cpu, prev_cpu); + want_affine = !wake_wide(p) && !_wake_cap && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); From fef0112a63989830cfd60f6ede53014600e751e0 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 3 Mar 2017 11:43:03 +0000 Subject: [PATCH 0869/3220] sched/fair: discount task contribution to find CPU with lowest utilization In some cases, the new_util of a task can be the same on several CPUs. This causes an issue because the target_util is only updated if the current new_util is strictly smaller than target_util. To fix that, the cpu_util_wake() return value is used alongside the new_util value. If two CPUs compute the same new_util value, we'll now also look at their cpu_util_wake() return value. In this case, the CPU that last ran the task will be chosen in priority. Change-Id: Ia1ea2c4b3ec39621372c2f748862317d5b497723 Signed-off-by: Valentin Schneider --- kernel/sched/fair.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23e2b5f33ff6..fc4e2529fbd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6221,7 +6221,8 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre int i; for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - unsigned long cur_capacity, new_util; + unsigned long cur_capacity, new_util, wake_util; + unsigned long min_wake_util = ULONG_MAX; if (!cpu_online(i)) continue; @@ -6231,7 +6232,8 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - new_util = cpu_util_wake(i, p) + task_util(p); + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util(p); /* * Ensure minimum capacity to grant the required boost. @@ -6266,8 +6268,15 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * Find a target cpu with the lowest/highest * utilization if prefer_idle/!prefer_idle. 
*/ - if ((prefer_idle && target_util > new_util) || - (!prefer_idle && target_util < new_util)) { + if (prefer_idle) { + /* Favor the CPU that last ran the task */ + if (new_util > target_util || + wake_util > min_wake_util) + continue; + min_wake_util = wake_util; + target_util = new_util; + target_cpu = i; + } else if (target_util < new_util) { target_util = new_util; target_cpu = i; } From fc969e3bfa7aaaee77d91c55807ee5a8c6a26b73 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 6 Feb 2017 16:28:53 +0000 Subject: [PATCH 0870/3220] sched/fair: Fix sched_group_energy() to support per-cpu capacity states sched_group_energy() was supposed to support per-cpu capacity states (DVFS), however, while fixing a hotplug issue this was broken as we bail out if there is no SD_SHARE_CAP_STATES flag set. This patch implements the hotplug race check differently and should therefore reinstate support for per-cpu capacity states. Change-Id: I5b865666c9ce833dcfa6514c574580d75aa0a195 Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc4e2529fbd2..cbd6d12e91fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5442,15 +5442,7 @@ static int sched_group_energy(struct energy_env *eenv) */ sd = rcu_dereference(per_cpu(sd_scs, cpu)); - if (!sd) - /* - * We most probably raced with hotplug; returning a - * wrong energy estimation is better than entering an - * infinite loop. - */ - return -EINVAL; - - if (sd->parent) + if (sd && sd->parent) sg_shared_cap = sd->parent->groups; for_each_domain(cpu, sd) { @@ -5505,6 +5497,14 @@ static int sched_group_energy(struct energy_env *eenv) } while (sg = sg->next, sg != sd->groups); } + + /* + * If we raced with hotplug and got an sd NULL-pointer; + * returning a wrong energy estimation is better than + * entering an infinite loop. + */ + if (cpumask_test_cpu(cpu, &visit_cpus)) + return -EINVAL; next_cpu: cpumask_clear_cpu(cpu, &visit_cpus); continue; From 4c031f0e6fbc0fcea864836356c77b3aa0b29947 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 24 Mar 2017 17:37:28 +0000 Subject: [PATCH 0871/3220] cpufreq/schedutil: use boosted_cpu_util for PELT to match WALT When using WALT we always used boosted cpu util for OPP selection. This is the primary purpose for boosted cpu util, but we hadn't changed the PELT utilization check to do the same thing. Fix that here. Change-Id: Id5ffb26eac23b25fe754255221f6d21b8cededfd Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/cpufreq_schedutil.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 191ba36a9eeb..75bfbb336722 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -19,9 +19,7 @@ #include "sched.h" #include "tune.h" -#ifdef CONFIG_SCHED_WALT unsigned long boosted_cpu_util(int cpu); -#endif /* Stub out fast switch routines present on mainline to reduce the backport * overhead. 
*/ @@ -188,6 +186,15 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, return cpufreq_driver_resolve_freq(policy, freq); } +static inline bool use_pelt(void) +{ +#ifdef CONFIG_SCHED_WALT + return (!sysctl_sched_use_walt_cpu_util || walt_disabled); +#else + return true; +#endif +} + static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) { int cpu = smp_processor_id(); @@ -204,11 +211,10 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) rt = div64_u64(rq->rt_avg, sched_avg_period() + delta); rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT; - *util = min(rq->cfs.avg.util_avg + rt, max_cap); -#ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) - *util = boosted_cpu_util(cpu); -#endif + *util = boosted_cpu_util(cpu); + if (likely(use_pelt())) + *util = min((*util + rt), max_cap); + *max = max_cap; } From 2e829cf17fd296cac853c2e7d2453da4af446948 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 24 Mar 2017 17:40:51 +0000 Subject: [PATCH 0872/3220] sched/tune: increase group count to 5 We use 5 groups everywhere else, this should default to the same. Change-Id: I05a20bdcf8046ea90a2e36979940cef11246e735 Signed-off-by: Chris Redpath Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 079b18802f17..d512b4fb218f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -205,7 +205,7 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, * implementation especially for the computation of the per-CPU boost * value */ -#define BOOSTGROUPS_COUNT 4 +#define BOOSTGROUPS_COUNT 5 /* Array of configured boostgroups */ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { From f9b83b3e6e723f5568fd8329197523080747d3a8 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 24 Mar 2017 18:56:16 +0000 Subject: [PATCH 0873/3220] sched/tune: fix sched_energy_diff tracepoint sched_energy_diff tracepoint is in a place where it can never trace payoff or nrg.delta. If CONFIG_SCHED_TUNE is enabled, put it in a place where those values exist. If it is not enabled, trace from the current location Change-Id: Id5442f2b34ec76625491d27c0f4285433ca12699 Reported-by: Valentin Schneider Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbd6d12e91fe..752d7ddd0f3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5574,13 +5574,13 @@ static inline int __energy_diff(struct energy_env *eenv) eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - +#ifndef CONFIG_SCHED_TUNE trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); - +#endif /* * Dead-zone margin preventing too many migrations. */ @@ -5651,6 +5651,12 @@ energy_diff(struct energy_env *eenv) eenv->cap.delta, eenv->task); + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + /* * When SchedTune is enabled, the energy_diff() function will return * the computed energy payoff value. 
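Summarized, the resulting trace placement is (schematic; assumes energy_diff() is an alias for __energy_diff() when CONFIG_SCHED_TUNE is not set, as in this tree):

	/*
	 *  CONFIG_SCHED_TUNE   tracepoint fires in      payoff / nrg.delta
	 *  not set             __energy_diff()          0, as before
	 *  set                 energy_diff() wrapper    real computed values
	 */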
Since the energy_diff() return From 3757f957419c1ccb6fd837ade571375a2cfdc9dc Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 13 Oct 2016 17:34:47 +0100 Subject: [PATCH 0874/3220] sched/tune: report when SchedTune has not been initialized Change-Id: Iba4e5e3d220451f04272d555e6b8e0af83a7f09d Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan --- kernel/sched/tune.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d512b4fb218f..9770cef183d0 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -939,6 +939,7 @@ schedtune_init(void) return 0; nodata: + pr_warning("schedtune: disabled!\n"); rcu_read_unlock(); return -EINVAL; } From 41d9288e3eace3988a4fc8ce9bffef425a265ed7 Mon Sep 17 00:00:00 2001 From: Srinath Sridharan Date: Tue, 8 Nov 2016 14:53:44 -0800 Subject: [PATCH 0875/3220] sched/tune: Initialize raw_spin_lock in boosted_groups bug: 32668852 Change-Id: Ice96230d88939d5973b1b6310085d1b3df9c47d9 Signed-off-by: Srinath Sridharan --- kernel/sched/tune.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 9770cef183d0..5e81aaad5566 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -640,6 +640,7 @@ schedtune_boostgroup_init(struct schedtune *st) bg = &per_cpu(cpu_boost_groups, cpu); bg->group[st->idx].boost = 0; bg->group[st->idx].tasks = 0; + raw_spin_lock_init(&bg->lock); } return 0; From 7b8577d94ccfdad13cc5c8a48e9643666e8fa167 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 13 Oct 2016 17:31:24 +0100 Subject: [PATCH 0876/3220] sched/{fair,tune}: use reciprocal_value to compute boost margin Change-Id: I493b07360c46eee0b72c2a046dab9ec6cb3427ef Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan --- kernel/sched/fair.c | 23 ++++++----------------- kernel/sched/tune.c | 3 +++ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 752d7ddd0f3a..cc00d92a3a2e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5805,6 +5805,8 @@ static bool cpu_overutilized(int cpu) #ifdef CONFIG_SCHED_TUNE +struct reciprocal_value schedtune_spc_rdiv; + static long schedtune_margin(unsigned long signal, long boost) { @@ -5815,29 +5817,16 @@ schedtune_margin(unsigned long signal, long boost) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S), if B is positive - * M = B * S, if B is negative + * M = B * (SCHED_CAPACITY_SCALE - S) * The obtained M could be used by the caller to "boost" S. 
*/ if (boost >= 0) { - margin = SCHED_LOAD_SCALE - signal; + margin = SCHED_CAPACITY_SCALE - signal; margin *= boost; } else margin = -signal * boost; - /* - * Fast integer division by constant: - * Constant : (C) = 100 - * Precision : 0.1% (P) = 0.1 - * Reference : C * 100 / P (R) = 100000 - * - * Thus: - * Shift bits : ceil(log(R,2)) (S) = 17 - * Mult const : round(2^S/C) (M) = 1311 - * - * - */ - margin *= 1311; - margin >>= 17; + + margin = reciprocal_divide(margin, schedtune_spc_rdiv); if (boost < 0) margin *= -1; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 5e81aaad5566..493564816679 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -17,6 +17,7 @@ static bool schedtune_initialized = false; unsigned int sysctl_sched_cfs_boost __read_mostly; +extern struct reciprocal_value schedtune_spc_rdiv; extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ @@ -937,6 +938,8 @@ schedtune_init(void) pr_info("schedtune: configured to support global boosting only\n"); #endif + schedtune_spc_rdiv = reciprocal_value(100); + return 0; nodata: From 9e3c04bef72b0586e87a2409e0c6d9b96d27da9c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 13 Oct 2016 18:13:20 +0100 Subject: [PATCH 0877/3220] sched/fair: use SCHED_CAPACITY_SCALE for energy normalization Change-Id: I686d26975f4a7dd830ff8441ff986e35461a7d55 Signed-off-by: Patrick Bellasi Signed-off-by: Srinath Sridharan --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cc00d92a3a2e..ef4dc3281e22 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5600,7 +5600,7 @@ struct target_nrg schedtune_target_nrg; /* * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], * corresponding to the specified energy variation. */ static inline int @@ -5620,7 +5620,7 @@ normalize_energy(int energy_diff) normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; + normalized_nrg <<= SCHED_CAPACITY_SHIFT; /* Normalize on max energy for target platform */ normalized_nrg = reciprocal_divide( From c47d00b57b26a5dc9bca89375dd8a80ee1a7cc5b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 27 Mar 2017 18:20:20 +0100 Subject: [PATCH 0878/3220] sched/tune: don't use schedtune before it is ready When EAS is enabled during boot, we have to be careful not to use schedtune from fair.c before it is ready or it will warn us and we'll get a traceback in the console. Change-Id: I1a5cf29b18af626545c636c51219f9ed497c19fa Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 9 ++++++++- kernel/sched/tune.c | 8 +++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef4dc3281e22..226d1990eea9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5597,7 +5597,7 @@ static inline int __energy_diff(struct energy_env *eenv) #ifdef CONFIG_SCHED_TUNE struct target_nrg schedtune_target_nrg; - +extern bool schedtune_initialized; /* * System energy normalization * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], @@ -5607,13 +5607,20 @@ static inline int normalize_energy(int energy_diff) { u32 normalized_nrg; + + /* during early setup, we don't know the extents */ + if (unlikely(!schedtune_initialized)) + return energy_diff < 0 ? 
-1 : 1 ; + #ifdef CONFIG_SCHED_DEBUG + { int max_delta; /* Check for boundaries */ max_delta = schedtune_target_nrg.max_power; max_delta -= schedtune_target_nrg.min_power; WARN_ON(abs(energy_diff) >= max_delta); + } #endif /* Do scaling using positive numbers to increase the range */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 493564816679..86d04caf1684 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -12,7 +12,7 @@ #include "tune.h" #ifdef CONFIG_CGROUP_SCHEDTUNE -static bool schedtune_initialized = false; +bool schedtune_initialized = false; #endif unsigned int sysctl_sched_cfs_boost __read_mostly; @@ -527,6 +527,9 @@ int schedtune_task_boost(struct task_struct *p) struct schedtune *st; int task_boost; + if (!unlikely(schedtune_initialized)) + return 0; + /* Get task boost value */ rcu_read_lock(); st = task_schedtune(p); @@ -541,6 +544,9 @@ int schedtune_prefer_idle(struct task_struct *p) struct schedtune *st; int prefer_idle; + if (!unlikely(schedtune_initialized)) + return 0; + /* Get prefer_idle value */ rcu_read_lock(); st = task_schedtune(p); From 8cbdbb9a6aa5ce9d53d2c7cf4aff2831e5b724f9 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 11 May 2017 16:15:15 -0700 Subject: [PATCH 0879/3220] FROMLIST: f2fs: sanity check checkpoint segno and blkoff Make sure segno and blkoff read from raw image are valid. (url https://sourceforge.net/p/linux-f2fs/mailman/message/35835945) Signed-off-by: Jin Qian Bug: 36588520 Change-Id: Iba66ab97d3d0870ea48b5ef192d9075f225a934a --- fs/f2fs/super.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 86e1cb899957..2ac3417d9412 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1078,6 +1078,8 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1089,6 +1091,22 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + main_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) { + return 1; + } + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) { + return 1; + } + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 57e54f412025c46954ae82d67e99f899d9882043 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Fri, 2 Jun 2017 20:22:07 +0000 Subject: [PATCH 0880/3220] BACKPORT: f2fs: sanity check size of nat and sit cache Make sure number of entries doesn't exceed max journal size.
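The two counts are read straight from the on-disk summary blocks and later serve as loop bounds over fixed-size journal arrays, so an oversized value walks past the block. Schematically (a sketch of the consumer pattern; see nats_in_cursum()/nat_in_journal() in f2fs.h of this tree):

	/* journal scans are bounded by the on-disk count, e.g.: */
	for (i = 0; i < nats_in_cursum(sum); i++) {
		/* entries[] holds at most NAT_JOURNAL_ENTRIES slots,
		 * so an unchecked n_nats reads past the summary block */
		raw_ne = nat_in_journal(sum, i);
	}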
Cc: stable@vger.kernel.org Signed-off-by: Jin Qian Signed-off-by: Jaegeuk Kim (url https://sourceforge.net/p/linux-f2fs/mailman/message/35872032) Bug: 36819470 Change-Id: I0db498cdc996ec204f56617bba5c4ab5ac6ade14 --- fs/f2fs/segment.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f77b3258454a..7965957dd0e6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1623,6 +1623,10 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_summary_block *s_sits = + CURSEG_I(sbi, CURSEG_COLD_DATA)->sum_blk; + struct f2fs_summary_block *s_nats = + CURSEG_I(sbi, CURSEG_HOT_DATA)->sum_blk; int type = CURSEG_HOT_DATA; int err; @@ -1649,6 +1653,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(s_nats) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(s_sits) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } From fce0ecf04a7e0b6f912151607758eb2e98888569 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Sat, 3 Jun 2017 15:03:03 +0100 Subject: [PATCH 0881/3220] schedstats/eas: guard properly to avoid breaking non-smp schedstats users Add appropriate #ifdef guards to ensure the smp-only easstats structs are not used when smp is not enabled. Arnd got a report from buildbot, analysed it, and pointed out exactly what the issue was. Reported-by: "Arnd Bergmann" Suggested-by: "Arnd Bergmann" Fixes: 4b85765a3dd9 ("sched/fair: Add eas (& cas) specific rq, sd and task stats") Signed-off-by: Chris Redpath Change-Id: I60554dea20137f6774db3f59b4afd40a06554cfc --- kernel/sched/sched.h | 3 ++- kernel/sched/stats.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6880fbc39760..ce364ddbb72c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -734,9 +734,10 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; - +#ifdef CONFIG_SMP struct eas_stats eas_stats; #endif +#endif #ifdef CONFIG_SMP struct llist_head wake_list; diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index b63879918cd6..6d74a7c77c8c 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -12,6 +12,7 @@ */ #define SCHEDSTAT_VERSION 15 +#ifdef CONFIG_SMP static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) { /* eas-specific runqueue stats */ @@ -31,6 +32,7 @@ static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) seq_printf(seq, "%llu %llu\n", stats->cas_attempts, stats->cas_count); } +#endif static int show_schedstat(struct seq_file *seq, void *v) { @@ -59,8 +61,9 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); - show_easstat(seq, &rq->eas_stats); #ifdef CONFIG_SMP + show_easstat(seq, &rq->eas_stats); + /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { From 5c31a5e9a365d1111568ac0484c0dda0e4bd09fe Mon Sep 17 00:00:00 2001 From: Roberto Pereira Date: Mon, 5 Jun 2017 15:26:49 -0700 Subject: [PATCH 0882/3220] android: base-cfg: disable CONFIG_NFS_FS and CONFIG_NFSD Signed-off-by: Roberto Pereira Bug:37753761 Change-Id: I1b96d7baa329dad0400c6e5c3fb12e81f1251a62 --- android/configs/android-base.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index b7b997fd58c9..b974d01b84af 100644 --- 
a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -3,6 +3,8 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set +# CONFIG_NFSD is not set +# CONFIG_NFS_FS is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set From 4fd2931f74468752711044528129e1155bde82fd Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 7 Jun 2017 12:44:50 -0700 Subject: [PATCH 0883/3220] ANDROID: sdcardfs: d_splice_alias can return error values We must check that d_splice_alias was successful before using its output. Signed-off-by: Daniel Rosenberg Bug: 62390017 Change-Id: Ifda0a052fb3f67e35c635a4e5e907876c5400978 --- fs/sdcardfs/lookup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c index 00ae21151f52..676e394e07be 100644 --- a/fs/sdcardfs/lookup.c +++ b/fs/sdcardfs/lookup.c @@ -199,7 +199,8 @@ static struct dentry *__sdcardfs_interpose(struct dentry *dentry, ret_dentry = d_splice_alias(inode, dentry); dentry = ret_dentry ?: dentry; - update_derived_permission_lock(dentry); + if (!IS_ERR(dentry)) + update_derived_permission_lock(dentry); out: return ret_dentry; } From 23874bf0d9eca04e7a537275f67d4d639318aa77 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Thu, 25 May 2017 15:20:29 +0800 Subject: [PATCH 0884/3220] ANDROID: uid_sys_stats: check previous uid_entry before call find_or_register_uid Theads in a process are stored in list struct task_struct->thread_group, so it will be visited continiously in below loop: do_each_thread(temp, task) { ... } while_each_thread(temp, task); I add some log in the loop, we can see below information: [ 65.033561] uid 1000, uid_entry ffffffc0f2761600 [ 65.033567] uid 1000, uid_entry ffffffc0f2761600 [ 65.033574] uid 1000, uid_entry ffffffc0f2761600 [ 65.033581] uid 1000, uid_entry ffffffc0f2761600 [ 65.033588] uid 1000, uid_entry ffffffc0f2761600 [ 65.033595] uid 1000, uid_entry ffffffc0f2761600 [ 65.033602] uid 1000, uid_entry ffffffc0f2761600 [ 65.033609] uid 1000, uid_entry ffffffc0f2761600 [ 65.033615] uid 1000, uid_entry ffffffc0f2761600 [ 65.033622] uid 1000, uid_entry ffffffc0f2761600 [ 65.033629] uid 1000, uid_entry ffffffc0f2761600 [ 65.033637] uid 1000, uid_entry ffffffc0f2761600 [ 65.033644] uid 1000, uid_entry ffffffc0f2761600 [ 65.033651] uid 1000, uid_entry ffffffc0f2761600 [ 65.033658] uid 1000, uid_entry ffffffc0f2761600 [ 65.033665] uid 1000, uid_entry ffffffc0f2761600 [ 65.033672] uid 1000, uid_entry ffffffc0f2761600 [ 65.033680] uid 1000, uid_entry ffffffc0f2761600 [ 65.033687] uid 1000, uid_entry ffffffc0f2761600 [ 65.033694] uid 1000, uid_entry ffffffc0f2761600 [ 65.033701] uid 1000, uid_entry ffffffc0f2761600 [ 65.033708] uid 1000, uid_entry ffffffc0f2761600 [ 65.033715] uid 1000, uid_entry ffffffc0f2761600 [ 65.033722] uid 1000, uid_entry ffffffc0f2761600 [ 65.033729] uid 1000, uid_entry ffffffc0f2761600 [ 65.033736] uid 1000, uid_entry ffffffc0f2761600 [ 65.033743] uid 1000, uid_entry ffffffc0f2761600 [ 65.033750] uid 1000, uid_entry ffffffc0f2761600 [ 65.033757] uid 1000, uid_entry ffffffc0f2761600 [ 65.033763] uid 1000, uid_entry ffffffc0f2761600 [ 65.033770] uid 1000, uid_entry ffffffc0f2761600 [ 65.033777] uid 1000, uid_entry ffffffc0f2761600 [ 65.033784] uid 1000, uid_entry ffffffc0f2761600 [ 65.033791] uid 1000, uid_entry ffffffc0f2761600 [ 65.033798] uid 1000, uid_entry ffffffc0f2761600 So we can check the previous uid_entry before calling find_or_register_uid to save 
time. Change-Id: I05ec1a1405a80c0a620cb4b4b2f6483dbfde7829 Signed-off-by: Ganesh Mahendran --- drivers/misc/uid_sys_stats.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 091370f4ea40..3c9d311106cd 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -94,7 +94,7 @@ static struct uid_entry *find_or_register_uid(uid_t uid) static int uid_cputime_show(struct seq_file *m, void *v) { - struct uid_entry *uid_entry; + struct uid_entry *uid_entry = NULL; struct task_struct *task, *temp; struct user_namespace *user_ns = current_user_ns(); cputime_t utime; @@ -112,7 +112,8 @@ static int uid_cputime_show(struct seq_file *m, void *v) read_lock(&tasklist_lock); do_each_thread(temp, task) { uid = from_kuid_munged(user_ns, task_uid(task)); - uid_entry = find_or_register_uid(uid); + if (!uid_entry || uid_entry->uid != uid) + uid_entry = find_or_register_uid(uid); if (!uid_entry) { read_unlock(&tasklist_lock); rt_mutex_unlock(&uid_lock); @@ -251,7 +252,7 @@ static void compute_uid_io_bucket_stats(struct io_stats *io_bucket, static void update_io_stats_all_locked(void) { - struct uid_entry *uid_entry; + struct uid_entry *uid_entry = NULL; struct task_struct *task, *temp; struct user_namespace *user_ns = current_user_ns(); unsigned long bkt; @@ -264,7 +265,8 @@ static void update_io_stats_all_locked(void) rcu_read_lock(); do_each_thread(temp, task) { uid = from_kuid_munged(user_ns, task_uid(task)); - uid_entry = find_or_register_uid(uid); + if (!uid_entry || uid_entry->uid != uid) + uid_entry = find_or_register_uid(uid); if (!uid_entry) continue; add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR); From d3135a2e661474a6c04c2957226e168004dc1824 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 8 Dec 2016 19:55:22 +0800 Subject: [PATCH 0885/3220] usb: gadget: f_fs: Fix possibe deadlock When system try to close /dev/usb-ffs/adb/ep0 on one core, at the same time another core try to attach new UDC, which will cause deadlock as below scenario. Thus we should release ffs lock before issuing unregister_gadget_item(). [ 52.642225] c1 ====================================================== [ 52.642228] c1 [ INFO: possible circular locking dependency detected ] [ 52.642236] c1 4.4.6+ #1 Tainted: G W O [ 52.642241] c1 ------------------------------------------------------- [ 52.642245] c1 usb ffs open/2808 is trying to acquire lock: [ 52.642270] c0 (udc_lock){+.+.+.}, at: [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642272] c1 but task is already holding lock: [ 52.642283] c0 (ffs_lock){+.+.+.}, at: [] ffs_data_clear+0x30/0x140 [ 52.642285] c1 which lock already depends on the new lock. 
[ 52.642287] c1 the existing dependency chain (in reverse order) is: [ 52.642295] c0 -> #1 (ffs_lock){+.+.+.}: [ 52.642307] c0 [] __lock_acquire+0x20f0/0x2238 [ 52.642314] c0 [] lock_acquire+0xe4/0x298 [ 52.642322] c0 [] mutex_lock_nested+0x7c/0x3cc [ 52.642328] c0 [] ffs_func_bind+0x504/0x6e8 [ 52.642334] c0 [] usb_add_function+0x84/0x184 [ 52.642340] c0 [] configfs_composite_bind+0x264/0x39c [ 52.642346] c0 [] udc_bind_to_driver+0x58/0x11c [ 52.642352] c0 [] usb_udc_attach_driver+0x90/0xc8 [ 52.642358] c0 [] gadget_dev_desc_UDC_store+0xd4/0x128 [ 52.642369] c0 [] configfs_write_file+0xd0/0x13c [ 52.642376] c0 [] vfs_write+0xb8/0x214 [ 52.642381] c0 [] SyS_write+0x54/0xb0 [ 52.642388] c0 [] el0_svc_naked+0x24/0x28 [ 52.642395] c0 -> #0 (udc_lock){+.+.+.}: [ 52.642401] c0 [] print_circular_bug+0x84/0x2e4 [ 52.642407] c0 [] __lock_acquire+0x2138/0x2238 [ 52.642412] c0 [] lock_acquire+0xe4/0x298 [ 52.642420] c0 [] mutex_lock_nested+0x7c/0x3cc [ 52.642427] c0 [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642432] c0 [] unregister_gadget_item+0x28/0x44 [ 52.642439] c0 [] ffs_data_clear+0x138/0x140 [ 52.642444] c0 [] ffs_data_reset+0x20/0x6c [ 52.642450] c0 [] ffs_data_closed+0xac/0x12c [ 52.642454] c0 [] ffs_ep0_release+0x20/0x2c [ 52.642460] c0 [] __fput+0xb0/0x1f4 [ 52.642466] c0 [] ____fput+0x20/0x2c [ 52.642473] c0 [] task_work_run+0xb4/0xe8 [ 52.642482] c0 [] do_exit+0x360/0xb9c [ 52.642487] c0 [] do_group_exit+0x4c/0xb0 [ 52.642494] c0 [] get_signal+0x380/0x89c [ 52.642501] c0 [] do_signal+0x154/0x518 [ 52.642507] c0 [] do_notify_resume+0x70/0x78 [ 52.642512] c0 [] work_pending+0x1c/0x20 [ 52.642514] c1 other info that might help us debug this: [ 52.642517] c1 Possible unsafe locking scenario: [ 52.642518] c1 CPU0 CPU1 [ 52.642520] c1 ---- ---- [ 52.642525] c0 lock(ffs_lock); [ 52.642529] c0 lock(udc_lock); [ 52.642533] c0 lock(ffs_lock); [ 52.642537] c0 lock(udc_lock); [ 52.642539] c1 *** DEADLOCK *** [ 52.642543] c1 1 lock held by usb ffs open/2808: [ 52.642555] c0 #0: (ffs_lock){+.+.+.}, at: [] ffs_data_clear+0x30/0x140 [ 52.642557] c1 stack backtrace: [ 52.642563] c1 CPU: 1 PID: 2808 Comm: usb ffs open Tainted: G [ 52.642565] c1 Hardware name: Spreadtrum SP9860g Board (DT) [ 52.642568] c1 Call trace: [ 52.642573] c1 [] dump_backtrace+0x0/0x170 [ 52.642577] c1 [] show_stack+0x20/0x28 [ 52.642583] c1 [] dump_stack+0xa8/0xe0 [ 52.642587] c1 [] print_circular_bug+0x1fc/0x2e4 [ 52.642591] c1 [] __lock_acquire+0x2138/0x2238 [ 52.642595] c1 [] lock_acquire+0xe4/0x298 [ 52.642599] c1 [] mutex_lock_nested+0x7c/0x3cc [ 52.642604] c1 [] usb_gadget_unregister_driver+0x3c/0xc8 [ 52.642608] c1 [] unregister_gadget_item+0x28/0x44 [ 52.642613] c1 [] ffs_data_clear+0x138/0x140 [ 52.642618] c1 [] ffs_data_reset+0x20/0x6c [ 52.642621] c1 [] ffs_data_closed+0xac/0x12c [ 52.642625] c1 [] ffs_ep0_release+0x20/0x2c [ 52.642629] c1 [] __fput+0xb0/0x1f4 [ 52.642633] c1 [] ____fput+0x20/0x2c [ 52.642636] c1 [] task_work_run+0xb4/0xe8 [ 52.642640] c1 [] do_exit+0x360/0xb9c [ 52.642644] c1 [] do_group_exit+0x4c/0xb0 [ 52.642647] c1 [] get_signal+0x380/0x89c [ 52.642651] c1 [] do_signal+0x154/0x518 [ 52.642656] c1 [] do_notify_resume+0x70/0x78 [ 52.642659] c1 [] work_pending+0x1c/0x20 Bug: 62572621 Change-Id: I2f6bea38c449ed56234a6e873d3cb33c2e83ff29 Acked-by: Michal Nazarewicz Signed-off-by: Baolin Wang Signed-off-by: Felipe Balbi (cherry picked from commit b3ce3ce02d146841af012d08506b4071db8ffde3) --- drivers/usb/gadget/function/f_fs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 9ad5145d3103..194a54991497 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3463,6 +3463,7 @@ static void ffs_closed(struct ffs_data *ffs) { struct ffs_dev *ffs_obj; struct f_fs_opts *opts; + struct config_item *ci; ENTER(); ffs_dev_lock(); @@ -3486,8 +3487,11 @@ static void ffs_closed(struct ffs_data *ffs) || !atomic_read(&opts->func_inst.group.cg_item.ci_kref.refcount)) goto done; - unregister_gadget_item(ffs_obj->opts-> - func_inst.group.cg_item.ci_parent->ci_parent); + ci = opts->func_inst.group.cg_item.ci_parent->ci_parent; + ffs_dev_unlock(); + + unregister_gadget_item(ci); + return; done: ffs_dev_unlock(); } From 44b35c3df9748f9caeab7915b7eacce713311fd0 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Tue, 13 Jun 2017 09:23:25 -0700 Subject: [PATCH 0886/3220] ANDROID: android-base.cfg: split out arm64-specific configs These config options are specific to arm64 so should not be universally required. Bug: 62523096 Change-Id: I52bcad68f32d5314032c6aa3f37402b2ffba79be Signed-off-by: Steve Muckle --- android/configs/android-base-arm64.cfg | 5 +++++ android/configs/android-base.cfg | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 android/configs/android-base-arm64.cfg diff --git a/android/configs/android-base-arm64.cfg b/android/configs/android-base-arm64.cfg new file mode 100644 index 000000000000..43f23d6b5391 --- /dev/null +++ b/android/configs/android-base-arm64.cfg @@ -0,0 +1,5 @@ +# KEEP ALPHABETICALLY SORTED +CONFIG_ARMV8_DEPRECATED=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y +CONFIG_SWP_EMULATION=y diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index b974d01b84af..d675e712fe8c 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -12,7 +12,6 @@ CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y -CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y CONFIG_AUDIT=y CONFIG_BLK_DEV_INITRD=y @@ -21,7 +20,6 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y -CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y @@ -155,9 +153,7 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y CONFIG_SECURITY_SELINUX=y -CONFIG_SETEND_EMULATION=y CONFIG_STAGING=y -CONFIG_SWP_EMULATION=y CONFIG_SYNC=y CONFIG_TUN=y CONFIG_UID_SYS_STATS=y From d769a181e9f03ad35cd3e61943e684b1f50a0e11 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 16 May 2017 16:48:49 -0700 Subject: [PATCH 0887/3220] ANDROID: sdcardfs: remove dead function open_flags_to_access_mode() smatch warns about the suspicious formatting in the last line of open_flags_to_access_mode(). It turns out the only caller was deleted over a year ago by "ANDROID: sdcardfs: Bring up to date with Android M permissions:", so we can "fix" the function's formatting by deleting it. 
Change-Id: Id85946f3eb01722eef35b1815f405a6fda3aa4ff Signed-off-by: Greg Hackmann --- fs/sdcardfs/packagelist.c | 13 ------------- fs/sdcardfs/sdcardfs.h | 1 - 2 files changed, 14 deletions(-) diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c index 8495474258d5..66c6f8e72a02 100644 --- a/fs/sdcardfs/packagelist.c +++ b/fs/sdcardfs/packagelist.c @@ -174,19 +174,6 @@ int check_caller_access_to_name(struct inode *parent_node, const struct qstr *na return 1; } -/* This function is used when file opening. The open flags must be - * checked before calling check_caller_access_to_name() - */ -int open_flags_to_access_mode(int open_flags) -{ - if ((open_flags & O_ACCMODE) == O_RDONLY) - return 0; /* R_OK */ - if ((open_flags & O_ACCMODE) == O_WRONLY) - return 1; /* W_OK */ - /* Probably O_RDRW, but treat as default to be safe */ - return 1; /* R_OK | W_OK */ -} - static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key, appid_t value) { diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index 31d37d7da4f9..b8624fd8b817 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -499,7 +499,6 @@ extern appid_t get_appid(const char *app_name); extern appid_t get_ext_gid(const char *app_name); extern appid_t is_excluded(const char *app_name, userid_t userid); extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name); -extern int open_flags_to_access_mode(int open_flags); extern int packagelist_init(void); extern void packagelist_exit(void); From 1200efcca9b5174bc8de5ac8440f49fab3bcd0f8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 24 Apr 2016 00:56:03 -0400 Subject: [PATCH 0888/3220] BACKPORT: ext4: fix data exposure after a crash Huang has reported that in his powerfail testing he is seeing stale block contents in some recently allocated blocks although he mounts ext4 in data=ordered mode. After some investigation I have found out that indeed when delayed allocation is used, we don't add the inode to the transaction's list of inodes needing flushing before commit. Originally we were doing that, but commit f3b59291a69d removed the logic with a flawed argument that it is not needed. The problem is that although for delayed allocated blocks we write their contents immediately after allocating them, there is no guarantee that the IO scheduler or device doesn't reorder things, and thus the transaction allocating blocks and attaching them to the inode can reach stable storage before the actual block contents. Actually, whenever we attach freshly allocated blocks to the inode using a written extent, we should add the inode to the transaction's ordered inode list to make sure we properly wait for the block contents to be written before committing the transaction. So that is what we do in this patch. This also handles other cases where stale data exposure was possible - like filling a hole via mmap in data=ordered,nodelalloc mode. The only exception to the above rule is extending direct IO writes, where blkdev_direct_IO() waits for the IO to complete before increasing i_size, and thus stale data exposure is not possible. For now we don't complicate the code with optimizing this special case since the overhead is pretty low. In case this is observed to be a performance problem we can always handle it using a special flag to ext4_map_blocks().
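The heart of the fix is a single rule in ext4_map_blocks(): freshly allocated blocks mapped as written extents must put the inode on the transaction's ordered list. A condensed sketch of that check, matching the hunk in the diff below:

	/* after a successful block allocation in ext4_map_blocks() */
	if (map->m_flags & EXT4_MAP_NEW &&
	    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
	    !IS_NOQUOTA(inode) &&
	    ext4_should_order_data(inode)) {
		/* make jbd2 wait for the data before committing */
		ret = ext4_jbd2_file_inode(handle, inode);
		if (ret)
			return ret;
	}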
CC: stable@vger.kernel.org Fixes: f3b59291a69d0b734be1fc8be489fef2dd846d3d Reported-by: "HUANG Weller (CM/ESW12-CN)" Tested-by: "HUANG Weller (CM/ESW12-CN)" Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o (cherry picked from commit 06bd3c36a733ac27962fea7d6f47168841376824) Signed-off-by: Connor O'Brien Bug: 62198330 Change-Id: Idc78b64e4f23e6085301c60057af6029b49a8193 --- fs/ext4/inode.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3d425eeab4a..2ac719290869 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -659,6 +659,20 @@ has_zeroout: ret = check_block_validity(inode, map); if (ret != 0) return ret; + + /* + * Inodes with freshly allocated blocks where contents will be + * visible after transaction commit must be on transaction's + * ordered data list. + */ + if (map->m_flags & EXT4_MAP_NEW && + !(map->m_flags & EXT4_MAP_UNWRITTEN) && + !IS_NOQUOTA(inode) && + ext4_should_order_data(inode)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) + return ret; + } } return retval; } @@ -1164,15 +1178,6 @@ static int ext4_write_end(struct file *file, trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_write_end(inode, pos, len, copied); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { - ret = ext4_jbd2_file_inode(handle, inode); - if (ret) { - unlock_page(page); - page_cache_release(page); - goto errout; - } - } - if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); From 6bb6b3e686cc90cd4b75541a6f1c95a98e0377bc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 May 2017 00:04:09 +0200 Subject: [PATCH 0889/3220] UPSTREAM: bpf: don't let ldimm64 leak map addresses on unprivileged [ Upstream commit 0d0e57697f162da4aa218b5feafe614fb666db07 ] The patch fixes two things at once: 1) It checks the env->allow_ptr_leaks and only prints the map address to the log if we have the privileges to do so, otherwise it just dumps 0 as we would when kptr_restrict is enabled on %pK. Given the latter is off by default and not every distro sets it, I don't want to rely on this, hence the 0 by default for unprivileged. 2) Printing of ldimm64 in the verifier log is currently broken in that we don't print the full immediate, but only the 32 bit part of the first insn part for ldimm64. Thus, fix this up as well; it's okay to access, since we verified all ldimm64 earlier already (including just constants) through replace_map_fd_with_map_ptr(). Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs") Fixes: cbd357008604 ("bpf: verifier (add ability to receive verification log)") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman Bug: 62199770 Change-Id: I62ee47d06ddc669ba2863e8cf24f8f3e7683a461 --- kernel/bpf/verifier.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cbfba78d3db..85de5094b936 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -313,7 +313,8 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_insn(struct bpf_insn *insn) +static void print_bpf_insn(const struct verifier_env *env, + const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -377,9 +378,19 @@ static void print_bpf_insn(struct bpf_insn *insn) insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM) { - verbose("(%02x) r%d = 0x%x\n", - insn->code, insn->dst_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !env->allow_ptr_leaks) + imm = 0; + + verbose("(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); } else { verbose("BUG_ld_%02x\n", insn->code); return; @@ -1758,7 +1769,7 @@ static int do_check(struct verifier_env *env) if (log_level) { verbose("%d: ", insn_idx); - print_bpf_insn(insn); + print_bpf_insn(env, insn); } if (class == BPF_ALU || class == BPF_ALU64) { From fa60377966b77928967294ee8d0cbce93d6648ad Mon Sep 17 00:00:00 2001 From: William Wu Date: Tue, 25 Apr 2017 17:45:48 +0800 Subject: [PATCH 0890/3220] UPSTREAM: usb: gadget: f_fs: avoid out of bounds access on comp_desc The Companion descriptor is only used for SuperSpeed endpoints; if the endpoints are HighSpeed or FullSpeed, the Companion descriptor will not be allocated, so we can only access it if the gadget is SuperSpeed. I can reproduce this issue on a Rockchip rk3368 SoC, which supports USB 2.0, using functionfs for ADB. A kernel built with CONFIG_KASAN=y and CONFIG_SLUB_DEBUG=y reports the following BUG: ================================================================== BUG: KASAN: slab-out-of-bounds in ffs_func_set_alt+0x224/0x3a0 at addr ffffffc0601f6509 Read of size 1 by task swapper/0/0 ============================================================================ BUG kmalloc-256 (Not tainted): kasan: bad access detected ---------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in ffs_func_bind+0x52c/0x99c age=1275 cpu=0 pid=1 alloc_debug_processing+0x128/0x17c ___slab_alloc.constprop.58+0x50c/0x610 __slab_alloc.isra.55.constprop.57+0x24/0x34 __kmalloc+0xe0/0x250 ffs_func_bind+0x52c/0x99c usb_add_function+0xd8/0x1d4 configfs_composite_bind+0x48c/0x570 udc_bind_to_driver+0x6c/0x170 usb_udc_attach_driver+0xa4/0xd0 gadget_dev_desc_UDC_store+0xcc/0x118 configfs_write_file+0x1a0/0x1f8 __vfs_write+0x64/0x174 vfs_write+0xe4/0x200 SyS_write+0x68/0xc8 el0_svc_naked+0x24/0x28 INFO: Freed in inode_doinit_with_dentry+0x3f0/0x7c4 age=1275 cpu=7 pid=247 ...
Call trace: [] dump_backtrace+0x0/0x230 [] show_stack+0x14/0x1c [] dump_stack+0xa0/0xc8 [] print_trailer+0x188/0x198 [] object_err+0x3c/0x4c [] kasan_report+0x324/0x4dc [] __asan_load1+0x24/0x50 [] ffs_func_set_alt+0x224/0x3a0 [] composite_setup+0xdcc/0x1ac8 [] android_setup+0x124/0x1a0 [] _setup+0x54/0x74 [] handle_ep0+0x3288/0x4390 [] dwc_otg_pcd_handle_out_ep_intr+0x14dc/0x2ae4 [] dwc_otg_pcd_handle_intr+0x1ec/0x298 [] dwc_otg_pcd_irq+0x10/0x20 [] handle_irq_event_percpu+0x124/0x3ac [] handle_irq_event+0x60/0xa0 [] handle_fasteoi_irq+0x10c/0x1d4 [] generic_handle_irq+0x30/0x40 [] __handle_domain_irq+0xac/0xdc [] gic_handle_irq+0x64/0xa4 ... Memory state around the buggy address: ffffffc0601f6400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffffffc0601f6480: 00 00 00 00 00 00 00 00 00 00 06 fc fc fc fc fc >ffffffc0601f6500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ^ ffffffc0601f6580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffffffc0601f6600: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00 ================================================================== Signed-off-by: William Wu Signed-off-by: Felipe Balbi (cherry picked from commit b7f73850bb4fac1e2209a4dd5e636d39be92f42c) Signed-off-by: Jerry Zhang Signed-off-by: Jerry Zhang --- drivers/usb/gadget/function/f_fs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 194a54991497..732e6ed5d7b4 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1668,12 +1668,12 @@ static int ffs_func_eps_enable(struct ffs_function *func) ep->ep->driver_data = ep; ep->ep->desc = ds; - comp_desc = (struct usb_ss_ep_comp_descriptor *)(ds + - USB_DT_ENDPOINT_SIZE); - ep->ep->maxburst = comp_desc->bMaxBurst + 1; - - if (needs_comp_desc) + if (needs_comp_desc) { + comp_desc = (struct usb_ss_ep_comp_descriptor *)(ds + + USB_DT_ENDPOINT_SIZE); + ep->ep->maxburst = comp_desc->bMaxBurst + 1; ep->ep->comp_desc = comp_desc; + } ret = usb_ep_enable(ep->ep); if (likely(!ret)) { From 575a44c7cc5e526ff0311eb66d5c6cb2d931406c Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 20 Jun 2017 17:05:33 -0700 Subject: [PATCH 0891/3220] ANDROID: squashfs: Fix signed division issue The value here can change depending on the type that PAGE_SIZE has on a given architecture. 
To avoid the ensuing signed and unsigned division conversions, we shift using PAGE_SHIFT instead. Signed-off-by: Daniel Rosenberg Bug: 35257858 Change-Id: I132cae93abea39390c3f0f91a4b2e026e97ed4c7 --- fs/squashfs/block.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2eb66decc5ab..4e3e0863f5ea 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -211,8 +211,8 @@ static int bh_is_optional(struct squashfs_read_request *req, int idx) int start_idx, end_idx; struct squashfs_sb_info *msblk = req->sb->s_fs_info; - start_idx = (idx * msblk->devblksize - req->offset) / PAGE_CACHE_SIZE; - end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) / PAGE_CACHE_SIZE; + start_idx = (idx * msblk->devblksize - req->offset) >> PAGE_SHIFT; + end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) >> PAGE_SHIFT; if (start_idx >= req->output->pages) return 1; if (start_idx < 0) From 16759392d588c6042e4496943ba718ef403e9735 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 26 Jun 2017 18:21:56 -0700 Subject: [PATCH 0892/3220] ANDROID: squashfs: Fix endianness issue Code in squashfs_process_blocks was not correctly assigning the length: casting to u16* introduced endianness issues on some architectures. Signed-off-by: Daniel Rosenberg Bug: 35257858 Change-Id: I9efaef4bc531b7469de79cf94738ade2dd6e6a8c --- fs/squashfs/block.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 4e3e0863f5ea..b3b95e2ae2ff 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -121,11 +121,12 @@ static void squashfs_process_blocks(struct squashfs_read_request *req) if (req->data_processing == SQUASHFS_METADATA) { /* Extract the length of the metadata block */ - if (req->offset != msblk->devblksize - 1) - length = *((u16 *)(bh[0]->b_data + req->offset)); - else { - length = bh[0]->b_data[req->offset]; - length |= bh[1]->b_data[0] << 8; + if (req->offset != msblk->devblksize - 1) { + length = le16_to_cpup((__le16 *) + (bh[0]->b_data + req->offset)); + } else { + length = (unsigned char)bh[0]->b_data[req->offset]; + length |= (unsigned char)bh[1]->b_data[0] << 8; } req->compressed = SQUASHFS_COMPRESSED(length); req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS From fbea122e3ff597c35adbdad04415765b26be8d85 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 23 Feb 2016 18:22:39 +0000 Subject: [PATCH 0893/3220] UPSTREAM: drivers/perf: arm_pmu: implement CPU_PM notifier (cherry picked from commit da4e4f18afe0f3729d68f3785c5802f786d36e34) When a CPU is suspended (either through suspend-to-RAM or CPUidle), the contents of its PMU registers can be lost, which means that counter register values initialized on power-down entry have to be reprogrammed on power-up to make sure the counter set-up is preserved (i.e. on power-up the registers take the Cold or Warm reset values, which can be architecturally UNKNOWN). To guarantee seamless profiling across a core power-down, this patch adds a CPU PM notifier to ARM PMUs that, on CPU PM entry to and exit from low-power states, saves/restores the PMU register set-up (using the ARM perf API), so that the power-down/up cycle does not affect perf behaviour (apart from an unavoidable black-out period between the power-up/down CPU PM notifications).
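The mechanics of the change are a standard CPU PM notifier: register a callback that reacts to low-power entry/exit events. A skeletal sketch of the pattern (the full implementation, including the per-event counter save/restore, follows in the diff):

	#include <linux/cpu_pm.h>

	static int cpu_pm_pmu_notify(struct notifier_block *b,
				     unsigned long cmd, void *v)
	{
		switch (cmd) {
		case CPU_PM_ENTER:
			/* stop the PMU, save per-event counter state */
			break;
		case CPU_PM_EXIT:
		case CPU_PM_ENTER_FAILED:
			/* reset the PMU, restore counters, restart */
			break;
		default:
			return NOTIFY_DONE;
		}
		return NOTIFY_OK;
	}

	/* registered once per PMU via cpu_pm_register_notifier() */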
Change-Id: Ifbd73b82ca9dc172c58e2488cda1af9af975b14f Cc: Will Deacon Cc: Sudeep Holla Cc: Daniel Lezcano Cc: Mathieu Poirier Cc: Mark Rutland Acked-by: Ashwin Chaugule Acked-by: Kevin Hilman Signed-off-by: Lorenzo Pieralisi Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 95 ++++++++++++++++++++++++++++++++++++ include/linux/perf/arm_pmu.h | 1 + 2 files changed, 96 insertions(+) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 8af1f900ea65..85b445c50fee 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -718,6 +719,93 @@ static int cpu_pmu_notify(struct notifier_block *b, unsigned long action, return NOTIFY_OK; } +#ifdef CONFIG_CPU_PM +static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) +{ + struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); + struct perf_event *event; + int idx; + + for (idx = 0; idx < armpmu->num_events; idx++) { + /* + * If the counter is not used skip it, there is no + * need of stopping/restarting it. + */ + if (!test_bit(idx, hw_events->used_mask)) + continue; + + event = hw_events->events[idx]; + + switch (cmd) { + case CPU_PM_ENTER: + /* + * Stop and update the counter + */ + armpmu_stop(event, PERF_EF_UPDATE); + break; + case CPU_PM_EXIT: + case CPU_PM_ENTER_FAILED: + /* Restore and enable the counter */ + armpmu_start(event, PERF_EF_RELOAD); + break; + default: + break; + } + } +} + +static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd, + void *v) +{ + struct arm_pmu *armpmu = container_of(b, struct arm_pmu, cpu_pm_nb); + struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); + int enabled = bitmap_weight(hw_events->used_mask, armpmu->num_events); + + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) + return NOTIFY_DONE; + + /* + * Always reset the PMU registers on power-up even if + * there are no events running. 
+ */ + if (cmd == CPU_PM_EXIT && armpmu->reset) + armpmu->reset(armpmu); + + if (!enabled) + return NOTIFY_OK; + + switch (cmd) { + case CPU_PM_ENTER: + armpmu->stop(armpmu); + cpu_pm_pmu_setup(armpmu, cmd); + break; + case CPU_PM_EXIT: + cpu_pm_pmu_setup(armpmu, cmd); + case CPU_PM_ENTER_FAILED: + armpmu->start(armpmu); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) +{ + cpu_pmu->cpu_pm_nb.notifier_call = cpu_pm_pmu_notify; + return cpu_pm_register_notifier(&cpu_pmu->cpu_pm_nb); +} + +static void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) +{ + cpu_pm_unregister_notifier(&cpu_pmu->cpu_pm_nb); +} +#else +static inline int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { return 0; } +static inline void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { } +#endif + static int cpu_pmu_init(struct arm_pmu *cpu_pmu) { int err; @@ -733,6 +821,10 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) if (err) goto out_hw_events; + err = cpu_pm_pmu_register(cpu_pmu); + if (err) + goto out_unregister; + for_each_possible_cpu(cpu) { struct pmu_hw_events *events = per_cpu_ptr(cpu_hw_events, cpu); raw_spin_lock_init(&events->pmu_lock); @@ -754,6 +846,8 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) return 0; +out_unregister: + unregister_cpu_notifier(&cpu_pmu->hotplug_nb); out_hw_events: free_percpu(cpu_hw_events); return err; @@ -761,6 +855,7 @@ out_hw_events: static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu) { + cpu_pm_pmu_unregister(cpu_pmu); unregister_cpu_notifier(&cpu_pmu->hotplug_nb); free_percpu(cpu_pmu->hw_events); } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index bfa673bb822d..0e55e4016f49 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -107,6 +107,7 @@ struct arm_pmu { struct platform_device *plat_device; struct pmu_hw_events __percpu *hw_events; struct notifier_block hotplug_nb; + struct notifier_block cpu_pm_nb; }; #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) From 25152dbcc1e5c0278670436e1f1e1f51320529c5 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Thu, 21 Apr 2016 10:24:34 +0100 Subject: [PATCH 0894/3220] UPSTREAM: drivers/perf: arm-pmu: fix RCU usage on pmu resume from low-power (cherry picked from cbcc72e037b8a3eb1fad3c1ae22021df21c97a51) Commit da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM notifier") added code in the arm perf infrastructure that allows the kernel to save/restore perf counters whenever the CPU enters a low-power state. The kernel saves/restores the counters for each active event through the armpmu_{stop/start} ARM pmu API, so that the low-power state enter/exit cycle is emulated through pmu start/stop operations for each event in use. However, calling armpmu_start() for each active event on power-up executes code that requires RCU locking (perf_event_update_userpage()) to be functional, so, given that the core may call the CPU_PM notifiers while running the idle thread in a quiescent RCU state, this is not allowed, as detected through the following splat when the kernel is run with CONFIG_PROVE_LOCKING enabled: [ 49.293286] [ 49.294761] =============================== [ 49.298895] [ INFO: suspicious RCU usage. ] [ 49.303031] 4.6.0-rc3+ #421 Not tainted [ 49.306821] ------------------------------- [ 49.310956] include/linux/rcupdate.h:872 rcu_read_lock() used illegally while idle!
[ 49.318530] [ 49.318530] other info that might help us debug this: [ 49.318530] [ 49.326451] [ 49.326451] RCU used illegally from idle CPU! [ 49.326451] rcu_scheduler_active = 1, debug_locks = 0 [ 49.337209] RCU used illegally from extended quiescent state! [ 49.342892] 2 locks held by swapper/2/0: [ 49.346768] #0: (cpu_pm_notifier_lock){......}, at: [] cpu_pm_exit+0x18/0x80 [ 49.355492] #1: (rcu_read_lock){......}, at: [] perf_event_update_userpage+0x0/0x260 This patch wraps the armpmu_start() call (that indirectly calls perf_event_update_userpage()) on CPU_PM notifier power state exit (or failed entry) within the RCU_NONIDLE() macro so that the RCU subsystem is made aware the calling cpu is not idle from an RCU perspective for the armpmu_start() call duration, therefore fixing the issue. Change-Id: I6352b3e7970a2116d23e258fa085f0c29982f789 Fixes: da4e4f18afe0 ("drivers/perf: arm_pmu: implement CPU_PM notifier") Signed-off-by: Lorenzo Pieralisi Reported-by: James Morse Suggested-by: Kevin Hilman Cc: Ashwin Chaugule Cc: Kevin Hilman Cc: Sudeep Holla Cc: Daniel Lezcano Cc: Mathieu Poirier Acked-by: Mark Rutland Acked-by: Paul E. McKenney Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- drivers/perf/arm_pmu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 85b445c50fee..64b07b2d47f8 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -745,8 +745,19 @@ static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: - /* Restore and enable the counter */ - armpmu_start(event, PERF_EF_RELOAD); + /* + * Restore and enable the counter. + * armpmu_start() indirectly calls + * + * perf_event_update_userpage() + * + * that requires RCU read locking to be functional, + * wrap the call within RCU_NONIDLE to make the + * RCU subsystem aware this cpu is not idle from + * an RCU perspective for the armpmu_start() call + * duration. + */ + RCU_NONIDLE(armpmu_start(event, PERF_EF_RELOAD)); break; default: break; From ebca043d15b000bc243540cb93109f818766fdf5 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Tue, 20 Jun 2017 09:35:33 -0700 Subject: [PATCH 0895/3220] UPSTREAM: selinux: enable genfscon labeling for tracefs In kernel version 4.1, tracefs was separated from debugfs into its own filesystem. Prior to this split, files in /sys/kernel/debug/tracing could be labeled during filesystem creation using genfscon or later from userspace using setxattr. This change re-enables support for genfscon labeling. 
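With tracefs in the list, SE_SBGENFS is set when the filesystem is mounted, so a policy rule of the form "genfscon tracefs / <context>" once again labels trace files as it did before the tracefs/debugfs split; the context itself is defined by the installed policy, not by this patch.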
Signed-off-by: Jeff Vander Stoep Acked-by: Stephen Smalley Signed-off-by: Paul Moore (cherry picked from commit 6a3911837da0a90ed599fd0a9836472f5e7ddf1b) Change-Id: I98ad8c829302346705c1abcdc8f019f479fdefb6 Bug: 62413700 --- security/selinux/hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 2bc8b555bca9..34427384605d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -740,6 +740,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, sbsec->flags |= SE_SBPROC | SE_SBGENFS; if (!strcmp(sb->s_type->name, "debugfs") || + !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore")) sbsec->flags |= SE_SBGENFS; From 92cc35433ddd0cba7a03fc19e88f8c854ee02055 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 9 May 2017 12:30:56 +0800 Subject: [PATCH 0896/3220] ANDROID: sdcardfs: use mount_nodev and fix an issue in sdcardfs_kill_sb Use the VFS mount_nodev() instead of the customized mount_nodev_with_options(), and switch sdcardfs_kill_sb() from generic_shutdown_super() to kill_anon_super(), since a superblock created with set_anon_super() must be torn down with kill_anon_super(). Signed-off-by: Gao Xiang Change-Id: Ibe46647aa2ce49d79291aa9d0295e9625cfccd80 --- fs/sdcardfs/main.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 3c5b51d49d21..80825b287836 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -364,41 +364,34 @@ out: return err; } -/* A feature which supports mount_nodev() with options */ -static struct dentry *mount_nodev_with_options(struct vfsmount *mnt, - struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - int (*fill_super)(struct vfsmount *, struct super_block *, - const char *, void *, int)) +struct sdcardfs_mount_private { + struct vfsmount *mnt; + const char *dev_name; + void *raw_data; +}; +static int __sdcardfs_fill_super( + struct super_block *sb, + void *_priv, int silent) { - int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); + struct sdcardfs_mount_private *priv = _priv; - if (IS_ERR(s)) - return ERR_CAST(s); - - s->s_flags = flags; - - error = fill_super(mnt, s, dev_name, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= MS_ACTIVE; - return dget(s->s_root); + return sdcardfs_read_super(priv->mnt, + sb, priv->dev_name, priv->raw_data, silent); } static struct dentry *sdcardfs_mount(struct vfsmount *mnt, struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { - /* - * dev_name is a lower_path_name, - * raw_data is a option string.
*/ - return mount_nodev_with_options(mnt, fs_type, flags, dev_name, - raw_data, sdcardfs_read_super); + struct sdcardfs_mount_private priv = { + .mnt = mnt, + .dev_name = dev_name, + .raw_data = raw_data + }; + + return mount_nodev(fs_type, flags, + &priv, __sdcardfs_fill_super); } static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type, @@ -423,7 +416,7 @@ void sdcardfs_kill_sb(struct super_block *sb) list_del(&sbi->list); mutex_unlock(&sdcardfs_super_list_lock); } - generic_shutdown_super(sb); + kill_anon_super(sb); } static struct file_system_type sdcardfs_fs_type = { From 97841e574afb85dca93016347efee005d8150efc Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Tue, 11 Jul 2017 10:06:37 -0700 Subject: [PATCH 0897/3220] ANDROID: android-base.cfg: remove CONFIG_CGROUP_DEBUG This config option is not required by Android. Bug: 63578267 Change-Id: I163fa19183734a1a343d525e885a000a495c242e Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index d675e712fe8c..28dce6c0516d 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -17,7 +17,6 @@ CONFIG_AUDIT=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_DEFAULT_SECURITY_SELINUX=y From d368c6faa19ba98d26e713d3bf258cc6466d8ac2 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Thu, 1 Jun 2017 10:59:12 -0700 Subject: [PATCH 0898/3220] sched: walt: fix window misalignment when HZ=300 Due to a rounding error, the hrtimer tick interval becomes 3333333 ns when HZ=300. Consequently, the tick timestamp nearest to WALT's default window size of 20ms will also be 19999998 ns (3333333 * 6). Change-Id: I08f9bd2dbecccbb683e4490d06d8b0da703d3ab2 Suggested-by: Joel Fernandes Signed-off-by: Joonwoo Park --- kernel/sched/walt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 6e053bd9830c..92c3aae8e056 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -72,7 +72,15 @@ static cpumask_t mpc_mask = CPU_MASK_ALL; __read_mostly unsigned int walt_ravg_window = 20000000; /* Min window size (in ns) = 10ms */ +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes 3333333 ns due to + * rounding error when HZ=300. + */ +#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) +#else #define MIN_SCHED_RAVG_WINDOW 10000000 +#endif /* Max window size (in ns) = 1s */ #define MAX_SCHED_RAVG_WINDOW 1000000000
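The arithmetic behind the new minimum: integer division truncates the HZ=300 tick period, and six such ticks land at 19999998 ns, just under the 20 ms default window. A standalone sketch checking the numbers (not part of the patch):

	#include <assert.h>

	#define NSEC_PER_SEC	1000000000L
	#define TICK_NSEC_HZ300	(NSEC_PER_SEC / 300)	/* truncates to 3333333 */

	static_assert(TICK_NSEC_HZ300 == 3333333, "tick period rounds down");
	static_assert(6 * TICK_NSEC_HZ300 == 19999998, "six ticks fall just short of 20 ms");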
Signed-off-by: Todd Kjos Change-Id: Ib507d62803f2beba7178c3f6f3f78bd1095b25b8 --- drivers/android/binder.c | 392 +++++++++++++-------------------------- 1 file changed, 133 insertions(+), 259 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9cf4f9bbc711..08cde76875d2 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -18,7 +18,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include @@ -47,11 +46,19 @@ #include #include "binder_trace.h" +static DEFINE_MUTEX(binder_main_lock); +static DEFINE_MUTEX(binder_deferred_lock); +static DEFINE_MUTEX(binder_mmap_lock); + static HLIST_HEAD(binder_devices); +static HLIST_HEAD(binder_procs); +static HLIST_HEAD(binder_deferred_list); +static HLIST_HEAD(binder_dead_nodes); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -atomic_t binder_last_id; +static int binder_last_id; +static struct workqueue_struct *binder_deferred_workqueue; #define BINDER_DEBUG_ENTRY(name) \ static int binder_##name##_open(struct inode *inode, struct file *file) \ @@ -166,24 +173,20 @@ enum binder_stat_types { struct binder_stats { int br[_IOC_NR(BR_FAILED_REPLY) + 1]; int bc[_IOC_NR(BC_REPLY_SG) + 1]; + int obj_created[BINDER_STAT_COUNT]; + int obj_deleted[BINDER_STAT_COUNT]; }; -/* These are still global, since it's not always easy to get the context */ -struct binder_obj_stats { - atomic_t obj_created[BINDER_STAT_COUNT]; - atomic_t obj_deleted[BINDER_STAT_COUNT]; -}; - -static struct binder_obj_stats binder_obj_stats; +static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { - atomic_inc(&binder_obj_stats.obj_deleted[type]); + binder_stats.obj_deleted[type]++; } static inline void binder_stats_created(enum binder_stat_types type) { - atomic_inc(&binder_obj_stats.obj_created[type]); + binder_stats.obj_created[type]++; } struct binder_transaction_log_entry { @@ -204,6 +207,8 @@ struct binder_transaction_log { int full; struct binder_transaction_log_entry entry[32]; }; +static struct binder_transaction_log binder_transaction_log; +static struct binder_transaction_log binder_transaction_log_failed; static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) @@ -224,21 +229,6 @@ struct binder_context { struct binder_node *binder_context_mgr_node; kuid_t binder_context_mgr_uid; const char *name; - - struct mutex binder_main_lock; - struct mutex binder_deferred_lock; - struct mutex binder_mmap_lock; - - struct hlist_head binder_procs; - struct hlist_head binder_dead_nodes; - struct hlist_head binder_deferred_list; - - struct work_struct deferred_work; - struct workqueue_struct *binder_deferred_workqueue; - struct binder_transaction_log transaction_log; - struct binder_transaction_log transaction_log_failed; - - struct binder_stats binder_stats; }; struct binder_device { @@ -461,18 +451,17 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) return retval; } -static inline void binder_lock(struct binder_context *context, const char *tag) +static inline void binder_lock(const char *tag) { trace_binder_lock(tag); - mutex_lock(&context->binder_main_lock); + mutex_lock(&binder_main_lock); trace_binder_locked(tag); } -static inline void binder_unlock(struct binder_context *context, - const char *tag) +static inline void binder_unlock(const char *tag) { trace_binder_unlock(tag); - mutex_unlock(&context->binder_main_lock); 
+ mutex_unlock(&binder_main_lock); } static void binder_set_nice(long nice) @@ -957,7 +946,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, binder_stats_created(BINDER_STAT_NODE); rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); - node->debug_id = atomic_inc_return(&binder_last_id); + node->debug_id = ++binder_last_id; node->proc = proc; node->ptr = ptr; node->cookie = cookie; @@ -1099,7 +1088,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, if (new_ref == NULL) return NULL; binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = atomic_inc_return(&binder_last_id); + new_ref->debug_id = ++binder_last_id; new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); @@ -1859,7 +1848,7 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; - e = binder_transaction_log_add(&context->transaction_log); + e = binder_transaction_log_add(&binder_transaction_log); e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; @@ -1981,7 +1970,7 @@ static void binder_transaction(struct binder_proc *proc, } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - t->debug_id = atomic_inc_return(&binder_last_id); + t->debug_id = ++binder_last_id; e->debug_id = t->debug_id; if (reply) @@ -2245,8 +2234,7 @@ err_no_context_mgr_node: { struct binder_transaction_log_entry *fe; - fe = binder_transaction_log_add( - &context->transaction_log_failed); + fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; } @@ -2274,8 +2262,8 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(uint32_t); trace_binder_command(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(context->binder_stats.bc)) { - context->binder_stats.bc[_IOC_NR(cmd)]++; + if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { + binder_stats.bc[_IOC_NR(cmd)]++; proc->stats.bc[_IOC_NR(cmd)]++; thread->stats.bc[_IOC_NR(cmd)]++; } @@ -2640,8 +2628,8 @@ static void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread, uint32_t cmd) { trace_binder_return(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(proc->stats.br)) { - proc->context->binder_stats.br[_IOC_NR(cmd)]++; + if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { + binder_stats.br[_IOC_NR(cmd)]++; proc->stats.br[_IOC_NR(cmd)]++; thread->stats.br[_IOC_NR(cmd)]++; } @@ -2705,7 +2693,7 @@ retry: if (wait_for_proc_work) proc->ready_threads++; - binder_unlock(proc->context, __func__); + binder_unlock(__func__); trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, @@ -2732,7 +2720,7 @@ retry: ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); } - binder_lock(proc->context, __func__); + binder_lock(__func__); if (wait_for_proc_work) proc->ready_threads--; @@ -3119,14 +3107,14 @@ static unsigned int binder_poll(struct file *filp, struct binder_thread *thread = NULL; int wait_for_proc_work; - binder_lock(proc->context, __func__); + binder_lock(__func__); thread = binder_get_thread(proc); wait_for_proc_work = thread->transaction_stack == NULL && list_empty(&thread->todo) && thread->return_error == BR_OK; - binder_unlock(proc->context, __func__); + binder_unlock(__func__); if (wait_for_proc_work) { if (binder_has_proc_work(proc, thread)) @@ -3253,7 +3241,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; 
struct binder_proc *proc = filp->private_data; - struct binder_context *context = proc->context; struct binder_thread *thread; unsigned int size = _IOC_SIZE(cmd); void __user *ubuf = (void __user *)arg; @@ -3267,7 +3254,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) goto err_unlocked; - binder_lock(context, __func__); + binder_lock(__func__); thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; @@ -3319,7 +3306,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err: if (thread) thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN; - binder_unlock(context, __func__); + binder_unlock(__func__); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); @@ -3391,7 +3378,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; - mutex_lock(&proc->context->binder_mmap_lock); + mutex_lock(&binder_mmap_lock); if (proc->buffer) { ret = -EBUSY; failure_string = "already mapped"; @@ -3406,7 +3393,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) } proc->buffer = area->addr; proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; - mutex_unlock(&proc->context->binder_mmap_lock); + mutex_unlock(&binder_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { @@ -3451,12 +3438,12 @@ err_alloc_small_buf_failed: kfree(proc->pages); proc->pages = NULL; err_alloc_pages_failed: - mutex_lock(&proc->context->binder_mmap_lock); + mutex_lock(&binder_mmap_lock); vfree(proc->buffer); proc->buffer = NULL; err_get_vm_area_failed: err_already_mapped: - mutex_unlock(&proc->context->binder_mmap_lock); + mutex_unlock(&binder_mmap_lock); err_bad_arg: pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); @@ -3483,15 +3470,15 @@ static int binder_open(struct inode *nodp, struct file *filp) miscdev); proc->context = &binder_dev->context; - binder_lock(proc->context, __func__); + binder_lock(__func__); binder_stats_created(BINDER_STAT_PROC); - hlist_add_head(&proc->proc_node, &proc->context->binder_procs); + hlist_add_head(&proc->proc_node, &binder_procs); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); filp->private_data = proc; - binder_unlock(proc->context, __func__); + binder_unlock(__func__); if (binder_debugfs_dir_entry_proc) { char strbuf[11]; @@ -3556,7 +3543,6 @@ static int binder_release(struct inode *nodp, struct file *filp) static int binder_node_release(struct binder_node *node, int refs) { struct binder_ref *ref; - struct binder_context *context = node->proc->context; int death = 0; list_del_init(&node->work.entry); @@ -3572,7 +3558,7 @@ static int binder_node_release(struct binder_node *node, int refs) node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; - hlist_add_head(&node->dead_node, &context->binder_dead_nodes); + hlist_add_head(&node->dead_node, &binder_dead_nodes); hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; @@ -3637,8 +3623,7 @@ static void binder_deferred_release(struct binder_proc *proc) node = rb_entry(n, struct binder_node, rb_node); nodes++; rb_erase(&node->rb_node, &proc->nodes); - incoming_refs = binder_node_release(node, - incoming_refs); + incoming_refs = binder_node_release(node, incoming_refs); } 
outgoing_refs = 0; @@ -3710,16 +3695,14 @@ static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; struct files_struct *files; - struct binder_context *context = - container_of(work, struct binder_context, deferred_work); int defer; do { - binder_lock(context, __func__); - mutex_lock(&context->binder_deferred_lock); - if (!hlist_empty(&context->binder_deferred_list)) { - proc = hlist_entry(context->binder_deferred_list.first, + binder_lock(__func__); + mutex_lock(&binder_deferred_lock); + if (!hlist_empty(&binder_deferred_list)) { + proc = hlist_entry(binder_deferred_list.first, struct binder_proc, deferred_work_node); hlist_del_init(&proc->deferred_work_node); defer = proc->deferred_work; @@ -3728,7 +3711,7 @@ static void binder_deferred_func(struct work_struct *work) proc = NULL; defer = 0; } - mutex_unlock(&context->binder_deferred_lock); + mutex_unlock(&binder_deferred_lock); files = NULL; if (defer & BINDER_DEFERRED_PUT_FILES) { @@ -3743,24 +3726,24 @@ static void binder_deferred_func(struct work_struct *work) if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ - binder_unlock(context, __func__); + binder_unlock(__func__); if (files) put_files_struct(files); } while (proc); } +static DECLARE_WORK(binder_deferred_work, binder_deferred_func); static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) { - mutex_lock(&proc->context->binder_deferred_lock); + mutex_lock(&binder_deferred_lock); proc->deferred_work |= defer; if (hlist_unhashed(&proc->deferred_work_node)) { hlist_add_head(&proc->deferred_work_node, - &proc->context->binder_deferred_list); - queue_work(proc->context->binder_deferred_workqueue, - &proc->context->deferred_work); + &binder_deferred_list); + queue_work(binder_deferred_workqueue, &binder_deferred_work); } - mutex_unlock(&proc->context->binder_deferred_lock); + mutex_unlock(&binder_deferred_lock); } static void print_binder_transaction(struct seq_file *m, const char *prefix, @@ -3991,20 +3974,8 @@ static const char * const binder_objstat_strings[] = { "transaction_complete" }; -static void add_binder_stats(struct binder_stats *from, struct binder_stats *to) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(to->bc); i++) - to->bc[i] += from->bc[i]; - - for (i = 0; i < ARRAY_SIZE(to->br); i++) - to->br[i] += from->br[i]; -} - static void print_binder_stats(struct seq_file *m, const char *prefix, - struct binder_stats *stats, - struct binder_obj_stats *obj_stats) + struct binder_stats *stats) { int i; @@ -4024,21 +3995,16 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, binder_return_strings[i], stats->br[i]); } - if (!obj_stats) - return; - - BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) != + BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != ARRAY_SIZE(binder_objstat_strings)); - BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) != - ARRAY_SIZE(obj_stats->obj_deleted)); - for (i = 0; i < ARRAY_SIZE(obj_stats->obj_created); i++) { - int obj_created = atomic_read(&obj_stats->obj_created[i]); - int obj_deleted = atomic_read(&obj_stats->obj_deleted[i]); - - if (obj_created || obj_deleted) + BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != + ARRAY_SIZE(stats->obj_deleted)); + for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { + if (stats->obj_created[i] || stats->obj_deleted[i]) seq_printf(m, "%s%s: active %d total %d\n", prefix, - binder_objstat_strings[i], - obj_created - obj_deleted, obj_created); + binder_objstat_strings[i], + stats->obj_created[i] - 
stats->obj_deleted[i], + stats->obj_created[i]); } } @@ -4093,131 +4059,85 @@ static void print_binder_proc_stats(struct seq_file *m, } seq_printf(m, " pending transactions: %d\n", count); - print_binder_stats(m, " ", &proc->stats, NULL); + print_binder_stats(m, " ", &proc->stats); } static int binder_state_show(struct seq_file *m, void *unused) { - struct binder_device *device; - struct binder_context *context; struct binder_proc *proc; struct binder_node *node; int do_lock = !binder_debug_no_lock; - bool wrote_dead_nodes_header = false; + + if (do_lock) + binder_lock(__func__); seq_puts(m, "binder state:\n"); - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); - if (!wrote_dead_nodes_header && - !hlist_empty(&context->binder_dead_nodes)) { - seq_puts(m, "dead nodes:\n"); - wrote_dead_nodes_header = true; - } - hlist_for_each_entry(node, &context->binder_dead_nodes, - dead_node) - print_binder_node(m, node); + if (!hlist_empty(&binder_dead_nodes)) + seq_puts(m, "dead nodes:\n"); + hlist_for_each_entry(node, &binder_dead_nodes, dead_node) + print_binder_node(m, node); - if (do_lock) - binder_unlock(context, __func__); - } - - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); - - hlist_for_each_entry(proc, &context->binder_procs, proc_node) - print_binder_proc(m, proc, 1); - if (do_lock) - binder_unlock(context, __func__); - } + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc(m, proc, 1); + if (do_lock) + binder_unlock(__func__); return 0; } static int binder_stats_show(struct seq_file *m, void *unused) { - struct binder_device *device; - struct binder_context *context; struct binder_proc *proc; - struct binder_stats total_binder_stats; int do_lock = !binder_debug_no_lock; - memset(&total_binder_stats, 0, sizeof(struct binder_stats)); - - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); - - add_binder_stats(&context->binder_stats, &total_binder_stats); - - if (do_lock) - binder_unlock(context, __func__); - } + if (do_lock) + binder_lock(__func__); seq_puts(m, "binder stats:\n"); - print_binder_stats(m, "", &total_binder_stats, &binder_obj_stats); - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); + print_binder_stats(m, "", &binder_stats); - hlist_for_each_entry(proc, &context->binder_procs, proc_node) - print_binder_proc_stats(m, proc); - if (do_lock) - binder_unlock(context, __func__); - } + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc_stats(m, proc); + if (do_lock) + binder_unlock(__func__); return 0; } static int binder_transactions_show(struct seq_file *m, void *unused) { - struct binder_device *device; - struct binder_context *context; struct binder_proc *proc; int do_lock = !binder_debug_no_lock; - seq_puts(m, "binder transactions:\n"); - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); + if (do_lock) + binder_lock(__func__); - hlist_for_each_entry(proc, &context->binder_procs, proc_node) - print_binder_proc(m, proc, 0); - if (do_lock) - binder_unlock(context, __func__); - } + seq_puts(m, "binder transactions:\n"); + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc(m, proc, 0); + if (do_lock) 
+ binder_unlock(__func__); return 0; } static int binder_proc_show(struct seq_file *m, void *unused) { - struct binder_device *device; - struct binder_context *context; struct binder_proc *itr; int pid = (unsigned long)m->private; int do_lock = !binder_debug_no_lock; - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - if (do_lock) - binder_lock(context, __func__); + if (do_lock) + binder_lock(__func__); - hlist_for_each_entry(itr, &context->binder_procs, proc_node) { - if (itr->pid == pid) { - seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, itr, 1); - } + hlist_for_each_entry(itr, &binder_procs, proc_node) { + if (itr->pid == pid) { + seq_puts(m, "binder proc state:\n"); + print_binder_proc(m, itr, 1); } - if (do_lock) - binder_unlock(context, __func__); } + if (do_lock) + binder_unlock(__func__); return 0; } @@ -4232,10 +4152,11 @@ static void print_binder_transaction_log_entry(struct seq_file *m, e->to_node, e->target_handle, e->data_size, e->offsets_size); } -static int print_binder_transaction_log(struct seq_file *m, - struct binder_transaction_log *log) +static int binder_transaction_log_show(struct seq_file *m, void *unused) { + struct binder_transaction_log *log = m->private; int i; + if (log->full) { for (i = log->next; i < ARRAY_SIZE(log->entry); i++) print_binder_transaction_log_entry(m, &log->entry[i]); @@ -4245,31 +4166,6 @@ static int print_binder_transaction_log(struct seq_file *m, return 0; } -static int binder_transaction_log_show(struct seq_file *m, void *unused) -{ - struct binder_device *device; - struct binder_context *context; - - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - print_binder_transaction_log(m, &context->transaction_log); - } - return 0; -} - -static int binder_failed_transaction_log_show(struct seq_file *m, void *unused) -{ - struct binder_device *device; - struct binder_context *context; - - hlist_for_each_entry(device, &binder_devices, hlist) { - context = &device->context; - print_binder_transaction_log(m, - &context->transaction_log_failed); - } - return 0; -} - static const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, @@ -4285,20 +4181,11 @@ BINDER_DEBUG_ENTRY(state); BINDER_DEBUG_ENTRY(stats); BINDER_DEBUG_ENTRY(transactions); BINDER_DEBUG_ENTRY(transaction_log); -BINDER_DEBUG_ENTRY(failed_transaction_log); - -static void __init free_binder_device(struct binder_device *device) -{ - if (device->context.binder_deferred_workqueue) - destroy_workqueue(device->context.binder_deferred_workqueue); - kfree(device); -} static int __init init_binder_device(const char *name) { int ret; struct binder_device *binder_device; - struct binder_context *context; binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL); if (!binder_device) @@ -4308,65 +4195,31 @@ static int __init init_binder_device(const char *name) binder_device->miscdev.minor = MISC_DYNAMIC_MINOR; binder_device->miscdev.name = name; - context = &binder_device->context; - context->binder_context_mgr_uid = INVALID_UID; - context->name = name; - - mutex_init(&context->binder_main_lock); - mutex_init(&context->binder_deferred_lock); - mutex_init(&context->binder_mmap_lock); - - context->binder_deferred_workqueue = - create_singlethread_workqueue(name); - - if (!context->binder_deferred_workqueue) { - ret = -ENOMEM; - goto err_create_singlethread_workqueue_failed; - } - - INIT_HLIST_HEAD(&context->binder_procs); - INIT_HLIST_HEAD(&context->binder_dead_nodes); - 
INIT_HLIST_HEAD(&context->binder_deferred_list); - INIT_WORK(&context->deferred_work, binder_deferred_func); + binder_device->context.binder_context_mgr_uid = INVALID_UID; + binder_device->context.name = name; ret = misc_register(&binder_device->miscdev); if (ret < 0) { - goto err_misc_register_failed; + kfree(binder_device); + return ret; } hlist_add_head(&binder_device->hlist, &binder_devices); - return ret; - -err_create_singlethread_workqueue_failed: -err_misc_register_failed: - free_binder_device(binder_device); return ret; } static int __init binder_init(void) { - int ret = 0; + int ret; char *device_name, *device_names; struct binder_device *device; struct hlist_node *tmp; - /* - * Copy the module_parameter string, because we don't want to - * tokenize it in-place. - */ - device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); - if (!device_names) + binder_deferred_workqueue = create_singlethread_workqueue("binder"); + if (!binder_deferred_workqueue) return -ENOMEM; - strcpy(device_names, binder_devices_param); - - while ((device_name = strsep(&device_names, ","))) { - ret = init_binder_device(device_name); - if (ret) - goto err_init_binder_device_failed; - } - binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); if (binder_debugfs_dir_entry_root) binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", @@ -4391,13 +4244,30 @@ static int __init binder_init(void) debugfs_create_file("transaction_log", S_IRUGO, binder_debugfs_dir_entry_root, - NULL, + &binder_transaction_log, &binder_transaction_log_fops); debugfs_create_file("failed_transaction_log", S_IRUGO, binder_debugfs_dir_entry_root, - NULL, - &binder_failed_transaction_log_fops); + &binder_transaction_log_failed, + &binder_transaction_log_fops); + } + + /* + * Copy the module_parameter string, because we don't want to + * tokenize it in-place. + */ + device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL); + if (!device_names) { + ret = -ENOMEM; + goto err_alloc_device_names_failed; + } + strcpy(device_names, binder_devices_param); + + while ((device_name = strsep(&device_names, ","))) { + ret = init_binder_device(device_name); + if (ret) + goto err_init_binder_device_failed; } return ret; @@ -4406,8 +4276,12 @@ err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); hlist_del(&device->hlist); - free_binder_device(device); + kfree(device); } +err_alloc_device_names_failed: + debugfs_remove_recursive(binder_debugfs_dir_entry_root); + + destroy_workqueue(binder_deferred_workqueue); return ret; } From 0cebb407b2f431d748d9cc85cf7e4232a9223342 Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Tue, 1 Sep 2015 12:42:07 -0700 Subject: [PATCH 0900/3220] FROMLIST: binder: Use wake up hint for synchronous transactions. (from https://patchwork.kernel.org/patch/9817747) Use wake_up_interruptible_sync() to hint to the scheduler binder transactions are synchronous wakeups. Disable preemption while waking to avoid ping-ponging on the binder lock. 
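The distinction the patch draws: for a synchronous transaction (a reply, or a call without TF_ONE_WAY set) the waker is about to block, so the sync wakeup variant lets the scheduler run the wakee on the current CPU immediately. Condensed from the diff below:

	if (target_wait) {
		if (reply || !(t->flags & TF_ONE_WAY))
			/* caller is about to block on the response */
			wake_up_interruptible_sync(target_wait);
		else
			/* one-way transaction: the caller keeps running */
			wake_up_interruptible(target_wait);
	}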
Change-Id: Ic406a232d0873662f80148e37acefe5243d912a0 Signed-off-by: Todd Kjos Git-commit: 443c026e90820170aa3db2c21d2933ae5922f900 Git-repo: https://android.googlesource.com/kernel/msm Signed-off-by: Omprakash Dhyade --- drivers/android/binder.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 08cde76875d2..211262071198 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2201,8 +2201,12 @@ static void binder_transaction(struct binder_proc *proc, list_add_tail(&t->work.entry, target_list); tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; list_add_tail(&tcomplete->entry, &thread->todo); - if (target_wait) - wake_up_interruptible(target_wait); + if (target_wait) { + if (reply || !(t->flags & TF_ONE_WAY)) + wake_up_interruptible_sync(target_wait); + else + wake_up_interruptible(target_wait); + } return; err_translate_failed: From 19a3948b3c02642a29df6339212a40424e24a570 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 10 Oct 2016 10:39:59 -0700 Subject: [PATCH 0901/3220] FROMLIST: binder: separate binder allocator structure from binder proc (from https://patchwork.kernel.org/patch/9817745/) The binder allocator is logically separate from the rest of the binder driver. Separate the data structures to prepare for splitting the allocator into its own file with its own locking. Change-Id: I5ca4df567ac4b8c8d6ee2116ae67c0c1d75c9747 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 216 ++++++++++++++++++++------------- drivers/android/binder_trace.h | 2 +- 2 files changed, 132 insertions(+), 86 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 211262071198..734edbd67410 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -319,6 +319,41 @@ enum binder_deferred_state { BINDER_DEFERRED_RELEASE = 0x04, }; +/** + * struct binder_alloc - per-binder proc state for binder allocator + * @vma: vm_area_struct passed to mmap_handler + * (invariant after mmap) + * @vma_vm_mm: copy of vma->vm_mm (invariant after mmap) + * @buffer: base of per-proc address space mapped via mmap + * @user_buffer_offset: offset between user and kernel VAs for buffer + * @buffers: list of all buffers for this proc + * @free_buffers: rb tree of buffers available for allocation + * sorted by size + * @allocated_buffers: rb tree of allocated buffers sorted by address + * @free_async_space: VA space available for async buffers. This is + * initialized at mmap time to 1/2 the full VA space + * @pages: array of physical page addresses for each page of + * mmap'd space + * @buffer_size: size of address space (could be less than requested) + * + * Bookkeeping structure for per-proc address space management for binder + * buffers. It is normally initialized during binder_init() and binder_mmap() + * calls.
The address space is used for both user-visible buffers and for + * struct binder_buffer objects used to track the user buffers + */ +struct binder_alloc { + struct vm_area_struct *vma; + struct mm_struct *vma_vm_mm; + void *buffer; + ptrdiff_t user_buffer_offset; + struct list_head buffers; + struct rb_root free_buffers; + struct rb_root allocated_buffers; + size_t free_async_space; + struct page **pages; + size_t buffer_size; +}; + struct binder_proc { struct hlist_node proc_node; struct rb_root threads; @@ -326,23 +361,11 @@ struct binder_proc { struct rb_root refs_by_desc; struct rb_root refs_by_node; int pid; - struct vm_area_struct *vma; - struct mm_struct *vma_vm_mm; struct task_struct *tsk; struct files_struct *files; struct hlist_node deferred_work_node; int deferred_work; - void *buffer; - ptrdiff_t user_buffer_offset; - struct list_head buffers; - struct rb_root free_buffers; - struct rb_root allocated_buffers; - size_t free_async_space; - - struct page **pages; - size_t buffer_size; - uint32_t buffer_free; struct list_head todo; wait_queue_head_t wait; struct binder_stats stats; @@ -353,6 +376,7 @@ struct binder_proc { int ready_threads; long default_priority; struct dentry *debugfs_entry; + struct binder_alloc alloc; struct binder_context *context; }; @@ -485,8 +509,10 @@ static void binder_set_nice(long nice) static size_t binder_buffer_size(struct binder_proc *proc, struct binder_buffer *buffer) { - if (list_is_last(&buffer->entry, &proc->buffers)) - return proc->buffer + proc->buffer_size - (void *)buffer->data; + if (list_is_last(&buffer->entry, &proc->alloc.buffers)) + return proc->alloc.buffer + + proc->alloc.buffer_size - + (void *)buffer->data; return (size_t)list_entry(buffer->entry.next, struct binder_buffer, entry) - (size_t)buffer->data; } @@ -494,7 +520,7 @@ static size_t binder_buffer_size(struct binder_proc *proc, static void binder_insert_free_buffer(struct binder_proc *proc, struct binder_buffer *new_buffer) { - struct rb_node **p = &proc->free_buffers.rb_node; + struct rb_node **p = &proc->alloc.free_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; size_t buffer_size; @@ -521,13 +547,13 @@ static void binder_insert_free_buffer(struct binder_proc *proc, p = &parent->rb_right; } rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->free_buffers); + rb_insert_color(&new_buffer->rb_node, &proc->alloc.free_buffers); } static void binder_insert_allocated_buffer(struct binder_proc *proc, struct binder_buffer *new_buffer) { - struct rb_node **p = &proc->allocated_buffers.rb_node; + struct rb_node **p = &proc->alloc.allocated_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; @@ -546,18 +572,19 @@ static void binder_insert_allocated_buffer(struct binder_proc *proc, BUG(); } rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->allocated_buffers); + rb_insert_color(&new_buffer->rb_node, &proc->alloc.allocated_buffers); } static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc, uintptr_t user_ptr) { - struct rb_node *n = proc->allocated_buffers.rb_node; + struct rb_node *n = proc->alloc.allocated_buffers.rb_node; struct binder_buffer *buffer; struct binder_buffer *kern_ptr; - kern_ptr = (struct binder_buffer *)(user_ptr - proc->user_buffer_offset - - offsetof(struct binder_buffer, data)); + kern_ptr = (struct binder_buffer *) + (user_ptr - proc->alloc.user_buffer_offset - + offsetof(struct binder_buffer, data)); while 
(n) { buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -598,8 +625,8 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, if (mm) { down_write(&mm->mmap_sem); - vma = proc->vma; - if (vma && mm != proc->vma_vm_mm) { + vma = proc->alloc.vma; + if (vma && mm != proc->alloc.vma_vm_mm) { pr_err("%d: vma mm and task mm mismatch\n", proc->pid); vma = NULL; @@ -618,7 +645,8 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; + page = &proc->alloc.pages[ + (page_addr - proc->alloc.buffer) / PAGE_SIZE]; BUG_ON(*page); *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); @@ -637,7 +665,7 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, goto err_map_kernel_failed; } user_page_addr = - (uintptr_t)page_addr + proc->user_buffer_offset; + (uintptr_t)page_addr + proc->alloc.user_buffer_offset; ret = vm_insert_page(vma, user_page_addr, page[0]); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", @@ -655,10 +683,14 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { - page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; + + page = &proc->alloc.pages[ + (page_addr - proc->alloc.buffer) / PAGE_SIZE]; if (vma) - zap_page_range(vma, (uintptr_t)page_addr + - proc->user_buffer_offset, PAGE_SIZE, NULL); + zap_page_range(vma, + (uintptr_t)page_addr + + proc->alloc.user_buffer_offset, + PAGE_SIZE, NULL); err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: @@ -681,7 +713,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, size_t extra_buffers_size, int is_async) { - struct rb_node *n = proc->free_buffers.rb_node; + struct rb_node *n = proc->alloc.free_buffers.rb_node; struct binder_buffer *buffer; size_t buffer_size; struct rb_node *best_fit = NULL; @@ -689,7 +721,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, void *end_page_addr; size_t size, data_offsets_size; - if (proc->vma == NULL) { + if (proc->alloc.vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", proc->pid); return NULL; @@ -710,7 +742,8 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, return NULL; } if (is_async && - proc->free_async_space < size + sizeof(struct binder_buffer)) { + proc->alloc.free_async_space < + size + sizeof(struct binder_buffer)) { binder_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd failed, no async space left\n", proc->pid, size); @@ -762,7 +795,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) return NULL; - rb_erase(best_fit, &proc->free_buffers); + rb_erase(best_fit, &proc->alloc.free_buffers); buffer->free = 0; binder_insert_allocated_buffer(proc, buffer); if (buffer_size != size) { @@ -780,10 +813,11 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, buffer->extra_buffers_size = extra_buffers_size; buffer->async_transaction = is_async; if (is_async) { - proc->free_async_space -= size + sizeof(struct binder_buffer); + proc->alloc.free_async_space -= + size + sizeof(struct binder_buffer); binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_alloc_buf size %zd async 
free %zd\n", - proc->pid, size, proc->free_async_space); + proc->pid, size, proc->alloc.free_async_space); } return buffer; @@ -806,7 +840,7 @@ static void binder_delete_free_buffer(struct binder_proc *proc, int free_page_end = 1; int free_page_start = 1; - BUG_ON(proc->buffers.next == &buffer->entry); + BUG_ON(proc->alloc.buffers.next == &buffer->entry); prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); BUG_ON(!prev->free); if (buffer_end_page(prev) == buffer_start_page(buffer)) { @@ -818,7 +852,7 @@ static void binder_delete_free_buffer(struct binder_proc *proc, proc->pid, buffer, prev); } - if (!list_is_last(&buffer->entry, &proc->buffers)) { + if (!list_is_last(&buffer->entry, &proc->alloc.buffers)) { next = list_entry(buffer->entry.next, struct binder_buffer, entry); if (buffer_start_page(next) == buffer_end_page(buffer)) { @@ -862,39 +896,40 @@ static void binder_free_buf(struct binder_proc *proc, BUG_ON(buffer->free); BUG_ON(size > buffer_size); BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < proc->buffer); - BUG_ON((void *)buffer > proc->buffer + proc->buffer_size); + BUG_ON((void *)buffer < proc->alloc.buffer); + BUG_ON((void *)buffer > proc->alloc.buffer + proc->alloc.buffer_size); if (buffer->async_transaction) { - proc->free_async_space += size + sizeof(struct binder_buffer); + proc->alloc.free_async_space += + size + sizeof(struct binder_buffer); binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_free_buf size %zd async free %zd\n", - proc->pid, size, proc->free_async_space); + proc->pid, size, proc->alloc.free_async_space); } binder_update_page_range(proc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), NULL); - rb_erase(&buffer->rb_node, &proc->allocated_buffers); + rb_erase(&buffer->rb_node, &proc->alloc.allocated_buffers); buffer->free = 1; - if (!list_is_last(&buffer->entry, &proc->buffers)) { + if (!list_is_last(&buffer->entry, &proc->alloc.buffers)) { struct binder_buffer *next = list_entry(buffer->entry.next, struct binder_buffer, entry); if (next->free) { - rb_erase(&next->rb_node, &proc->free_buffers); + rb_erase(&next->rb_node, &proc->alloc.free_buffers); binder_delete_free_buffer(proc, next); } } - if (proc->buffers.next != &buffer->entry) { + if (proc->alloc.buffers.next != &buffer->entry) { struct binder_buffer *prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); if (prev->free) { binder_delete_free_buffer(proc, buffer); - rb_erase(&prev->rb_node, &proc->free_buffers); + rb_erase(&prev->rb_node, &proc->alloc.free_buffers); buffer = prev; } } @@ -1533,7 +1568,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, * back to kernel address space to access it */ parent_buffer = parent->buffer - - proc->user_buffer_offset; + proc->alloc.user_buffer_offset; fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { @@ -1751,7 +1786,7 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, * Since the parent was already fixed up, convert it * back to the kernel address space to access it */ - parent_buffer = parent->buffer - target_proc->user_buffer_offset; + parent_buffer = parent->buffer - target_proc->alloc.user_buffer_offset; fd_array = (u32 *)(parent_buffer + fda->parent_offset); if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", @@ -1819,7 +1854,7 @@ static int binder_fixup_parent(struct binder_transaction 
*t, return -EINVAL; } parent_buffer = (u8 *)(parent->buffer - - target_proc->user_buffer_offset); + target_proc->alloc.user_buffer_offset); *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; return 0; @@ -2159,7 +2194,7 @@ static void binder_transaction(struct binder_proc *proc, } /* Fixup buffer pointer to target proc address space */ bp->buffer = (uintptr_t)sg_bufp + - target_proc->user_buffer_offset; + target_proc->alloc.user_buffer_offset; sg_bufp += ALIGN(bp->length, sizeof(u64)); ret = binder_fixup_parent(t, thread, bp, off_start, @@ -2921,7 +2956,7 @@ retry: tr.offsets_size = t->buffer->offsets_size; tr.data.ptr.buffer = (binder_uintptr_t)( (uintptr_t)t->buffer->data + - proc->user_buffer_offset); + proc->alloc.user_buffer_offset); tr.data.ptr.offsets = tr.data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); @@ -3339,8 +3374,8 @@ static void binder_vma_close(struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); - proc->vma = NULL; - proc->vma_vm_mm = NULL; + proc->alloc.vma = NULL; + proc->alloc.vma_vm_mm = NULL; binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } @@ -3383,7 +3418,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; mutex_lock(&binder_mmap_lock); - if (proc->buffer) { + if (proc->alloc.buffer) { ret = -EBUSY; failure_string = "already mapped"; goto err_already_mapped; @@ -3395,56 +3430,65 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) failure_string = "get_vm_area"; goto err_get_vm_area_failed; } - proc->buffer = area->addr; - proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; + proc->alloc.buffer = area->addr; + proc->alloc.user_buffer_offset = + vma->vm_start - (uintptr_t)proc->alloc.buffer; mutex_unlock(&binder_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { - while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) { - pr_info("binder_mmap: %d %lx-%lx maps %p bad alignment\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer); + while (CACHE_COLOUR( + (vma->vm_start ^ (uint32_t)proc->alloc.buffer))) { + pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", + proc->pid, vma->vm_start, + vma->vm_end, proc->alloc.buffer); vma->vm_start += PAGE_SIZE; } } #endif - proc->pages = kzalloc(sizeof(proc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL); - if (proc->pages == NULL) { + proc->alloc.pages = + kzalloc(sizeof(proc->alloc.pages[0]) * + ((vma->vm_end - vma->vm_start) / PAGE_SIZE), + GFP_KERNEL); + if (proc->alloc.pages == NULL) { ret = -ENOMEM; failure_string = "alloc page array"; goto err_alloc_pages_failed; } - proc->buffer_size = vma->vm_end - vma->vm_start; + proc->alloc.buffer_size = vma->vm_end - vma->vm_start; vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; - if (binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)) { + if (binder_update_page_range(proc, 1, proc->alloc.buffer, + proc->alloc.buffer + PAGE_SIZE, vma)) { ret = -ENOMEM; failure_string = "alloc small buf"; goto err_alloc_small_buf_failed; } - buffer = proc->buffer; - INIT_LIST_HEAD(&proc->buffers); - list_add(&buffer->entry, &proc->buffers); + buffer = proc->alloc.buffer; + INIT_LIST_HEAD(&proc->alloc.buffers); + list_add(&buffer->entry, &proc->alloc.buffers); buffer->free = 1; binder_insert_free_buffer(proc, buffer); - 
proc->free_async_space = proc->buffer_size / 2; + proc->alloc.free_async_space = proc->alloc.buffer_size / 2; barrier(); proc->files = get_files_struct(current); - proc->vma = vma; - proc->vma_vm_mm = vma->vm_mm; + proc->alloc.vma = vma; + proc->alloc.vma_vm_mm = vma->vm_mm; - /*pr_info("binder_mmap: %d %lx-%lx maps %p\n", - proc->pid, vma->vm_start, vma->vm_end, proc->buffer);*/ + /*pr_info("binder_mmap: %d %lx-%lx maps %pK\n", + * proc->pid, vma->vm_start, vma->vm_end, proc->alloc.buffer); + */ return 0; err_alloc_small_buf_failed: - kfree(proc->pages); - proc->pages = NULL; + kfree(proc->alloc.pages); + proc->alloc.pages = NULL; err_alloc_pages_failed: mutex_lock(&binder_mmap_lock); - vfree(proc->buffer); - proc->buffer = NULL; + vfree(proc->alloc.buffer); + proc->alloc.buffer = NULL; err_get_vm_area_failed: err_already_mapped: mutex_unlock(&binder_mmap_lock); @@ -3596,7 +3640,7 @@ static void binder_deferred_release(struct binder_proc *proc) int threads, nodes, incoming_refs, outgoing_refs, buffers, active_transactions, page_count; - BUG_ON(proc->vma); + BUG_ON(proc->alloc.vma); BUG_ON(proc->files); hlist_del(&proc->proc_node); @@ -3643,7 +3687,7 @@ static void binder_deferred_release(struct binder_proc *proc) binder_release_work(&proc->delivered_death); buffers = 0; - while ((n = rb_first(&proc->allocated_buffers))) { + while ((n = rb_first(&proc->alloc.allocated_buffers))) { struct binder_buffer *buffer; buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -3664,25 +3708,25 @@ static void binder_deferred_release(struct binder_proc *proc) binder_stats_deleted(BINDER_STAT_PROC); page_count = 0; - if (proc->pages) { + if (proc->alloc.pages) { int i; - for (i = 0; i < proc->buffer_size / PAGE_SIZE; i++) { + for (i = 0; i < proc->alloc.buffer_size / PAGE_SIZE; i++) { void *page_addr; - if (!proc->pages[i]) + if (!proc->alloc.pages[i]) continue; - page_addr = proc->buffer + i * PAGE_SIZE; + page_addr = proc->alloc.buffer + i * PAGE_SIZE; binder_debug(BINDER_DEBUG_BUFFER_ALLOC, "%s: %d: page %d at %p not freed\n", __func__, proc->pid, i, page_addr); unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(proc->pages[i]); + __free_page(proc->alloc.pages[i]); page_count++; } - kfree(proc->pages); - vfree(proc->buffer); + kfree(proc->alloc.pages); + vfree(proc->alloc.buffer); } put_task_struct(proc->tsk); @@ -3912,7 +3956,8 @@ static void print_binder_proc(struct seq_file *m, print_binder_ref(m, rb_entry(n, struct binder_ref, rb_node_desc)); } - for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n)) + for (n = rb_first(&proc->alloc.allocated_buffers); + n != NULL; n = rb_next(n)) print_binder_buffer(m, " buffer", rb_entry(n, struct binder_buffer, rb_node)); list_for_each_entry(w, &proc->todo, entry) @@ -4029,7 +4074,7 @@ static void print_binder_proc_stats(struct seq_file *m, " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, - proc->ready_threads, proc->free_async_space); + proc->ready_threads, proc->alloc.free_async_space); count = 0; for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; @@ -4047,7 +4092,8 @@ static void print_binder_proc_stats(struct seq_file *m, seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); count = 0; - for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n)) + for (n = rb_first(&proc->alloc.allocated_buffers); + n != NULL; n = rb_next(n)) count++; seq_printf(m, " buffers: %d\n", count); diff --git 
a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h
index 7f20f3dc8369..c835f09656c1 100644
--- a/drivers/android/binder_trace.h
+++ b/drivers/android/binder_trace.h
@@ -280,7 +280,7 @@ TRACE_EVENT(binder_update_page_range,
 	TP_fast_assign(
 		__entry->proc = proc->pid;
 		__entry->allocate = allocate;
-		__entry->offset = start - proc->buffer;
+		__entry->offset = start - proc->alloc.buffer;
 		__entry->size = end - start;
 	),
 	TP_printk("proc=%d allocate=%d offset=%zu size=%zu",

From b582e88aa5c657c588da128821f434991f3ba86b Mon Sep 17 00:00:00 2001
From: Todd Kjos
Date: Fri, 21 Apr 2017 11:18:12 -0700
Subject: [PATCH 0902/3220] FROMLIST: binder: remove unneeded cleanup code

(from https://patchwork.kernel.org/patch/9817817/)

The buffer's transaction has already been freed before
binder_deferred_release. No need to do it again.

Change-Id: I412709c23879c5e45a6c7304a588bba0a5223fc3
Signed-off-by: Todd Kjos
---
 drivers/android/binder.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 734edbd67410..07be2833250d 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3634,7 +3634,6 @@ static int binder_node_release(struct binder_node *node, int refs)
 static void binder_deferred_release(struct binder_proc *proc)
 {
-	struct binder_transaction *t;
 	struct binder_context *context = proc->context;
 	struct rb_node *n;
 	int threads, nodes, incoming_refs, outgoing_refs, buffers,
@@ -3692,14 +3691,8 @@ static void binder_deferred_release(struct binder_proc *proc)
 		buffer = rb_entry(n, struct binder_buffer, rb_node);
-		t = buffer->transaction;
-		if (t) {
-			t->buffer = NULL;
-			buffer->transaction = NULL;
-			pr_err("release proc %d, transaction %d, not freed\n",
-			       proc->pid, t->debug_id);
-			/*BUG();*/
-		}
+		/* Transaction should already have been freed */
+		BUG_ON(buffer->transaction);
 		binder_free_buf(proc, buffer);
 		buffers++;

From 467545d8424090762c38d0bb2ea88da35b94fb09 Mon Sep 17 00:00:00 2001
From: Todd Kjos
Date: Mon, 10 Oct 2016 10:40:53 -0700
Subject: [PATCH 0903/3220] FROMLIST: binder: separate out binder_alloc functions

(from https://patchwork.kernel.org/patch/9817753/)

Continuation of splitting the binder allocator from the binder
driver. Separate binder_alloc functions from normal binder
functions. Protect the allocator with a separate mutex.
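Every entry point introduced here follows the same shape: an internal
*_locked() helper that assumes alloc->mutex is held, wrapped by a public
function that takes and releases the mutex around a single call to the
locked variant. A minimal user-space sketch of that shape follows; the
names are illustrative only, and pthread_mutex_t stands in for the
kernel's struct mutex:

#include <pthread.h>
#include <stddef.h>

struct sample_alloc {
	pthread_mutex_t mutex;		/* protects the fields below */
	size_t free_async_space;
};

/* mirrors binder_alloc_init(): set up the allocator's own lock */
void sample_alloc_init(struct sample_alloc *alloc)
{
	pthread_mutex_init(&alloc->mutex, NULL);
	alloc->free_async_space = 0;
}

/* internal variant: caller must already hold alloc->mutex */
static size_t sample_free_async_space_locked(struct sample_alloc *alloc)
{
	return alloc->free_async_space;
}

/* public entry point: takes the allocator mutex around the locked helper */
size_t sample_get_free_async_space(struct sample_alloc *alloc)
{
	size_t space;

	pthread_mutex_lock(&alloc->mutex);
	space = sample_free_async_space_locked(alloc);
	pthread_mutex_unlock(&alloc->mutex);
	return space;
}

In the patch itself this shape shows up as binder_alloc_new_buf() over
binder_alloc_new_buf_locked(), binder_alloc_buffer_lookup() over
binder_alloc_buffer_lookup_locked(), and the
binder_alloc_get_free_async_space() accessor, which keeps the
allocator's locking self-contained ahead of its move into a separate
file in the following patch.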
Bug: 33250092 32225111 Change-Id: I634637415aa03c74145159d84f1749bbe785a8f3 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 655 ++++++++++++++++++++------------- drivers/android/binder_trace.h | 9 +- 2 files changed, 412 insertions(+), 252 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 07be2833250d..e73481a8ea2c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -48,7 +48,7 @@ static DEFINE_MUTEX(binder_main_lock); static DEFINE_MUTEX(binder_deferred_lock); -static DEFINE_MUTEX(binder_mmap_lock); +static DEFINE_MUTEX(binder_alloc_mmap_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); @@ -104,9 +104,7 @@ enum { BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10, BINDER_DEBUG_FREE_BUFFER = 1U << 11, BINDER_DEBUG_INTERNAL_REFS = 1U << 12, - BINDER_DEBUG_BUFFER_ALLOC = 1U << 13, - BINDER_DEBUG_PRIORITY_CAP = 1U << 14, - BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 15, + BINDER_DEBUG_PRIORITY_CAP = 1U << 13, }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; @@ -159,6 +157,27 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) +/* + * debug declarations for binder_alloc. To be + * moved to binder_alloc.c + */ +enum { + BINDER_ALLOC_DEBUG_OPEN_CLOSE = 1U << 1, + BINDER_ALLOC_DEBUG_BUFFER_ALLOC = 1U << 2, + BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, +}; +static uint32_t binder_alloc_debug_mask; + +module_param_named(alloc_debug_mask, binder_alloc_debug_mask, + uint, S_IWUSR | S_IRUGO); + +#define binder_alloc_debug(mask, x...) \ + do { \ + if (binder_alloc_debug_mask & mask) \ + pr_info(x); \ + } while (0) +/* end of binder_alloc debug declarations */ + enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -342,6 +361,8 @@ enum binder_deferred_state { * struct binder_buffer objects used to track the user buffers */ struct binder_alloc { + struct mutex mutex; + struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; @@ -352,6 +373,7 @@ struct binder_alloc { size_t free_async_space; struct page **pages; size_t buffer_size; + int pid; }; struct binder_proc { @@ -423,6 +445,56 @@ struct binder_transaction { kuid_t sender_euid; }; +/* + * Forward declarations of binder_alloc functions. + * These will be moved to binder_alloc.h when + * binder_alloc is moved to its own files. 
+ */ +extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async); +extern void binder_alloc_init(struct binder_alloc *alloc); +extern void binder_alloc_vma_close(struct binder_alloc *alloc); +extern struct binder_buffer * +binder_alloc_buffer_lookup(struct binder_alloc *alloc, + uintptr_t user_ptr); +extern void binder_alloc_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffer); +extern int binder_alloc_mmap_handler(struct binder_alloc *alloc, + struct vm_area_struct *vma); +extern void binder_alloc_deferred_release(struct binder_alloc *alloc); +extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc); +extern void binder_alloc_print_allocated(struct seq_file *m, + struct binder_alloc *alloc); + +static inline size_t +binder_alloc_get_free_async_space(struct binder_alloc *alloc) +{ + size_t free_async_space; + + mutex_lock(&alloc->mutex); + free_async_space = alloc->free_async_space; + mutex_unlock(&alloc->mutex); + return free_async_space; +} + +static inline ptrdiff_t +binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc) +{ + /* + * user_buffer_offset is constant if vma is set and + * undefined if vma is not set. It is possible to + * get here with !alloc->vma if the target process + * is dying while a transaction is being initiated. + * Returning the old value is ok in this case and + * the transaction will fail. + */ + return alloc->user_buffer_offset; +} +/* end of binder_alloc declarations */ + static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); @@ -506,21 +578,20 @@ static void binder_set_nice(long nice) binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } -static size_t binder_buffer_size(struct binder_proc *proc, - struct binder_buffer *buffer) +static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, + struct binder_buffer *buffer) { - if (list_is_last(&buffer->entry, &proc->alloc.buffers)) - return proc->alloc.buffer + - proc->alloc.buffer_size - - (void *)buffer->data; + if (list_is_last(&buffer->entry, &alloc->buffers)) + return alloc->buffer + + alloc->buffer_size - (void *)buffer->data; return (size_t)list_entry(buffer->entry.next, struct binder_buffer, entry) - (size_t)buffer->data; } -static void binder_insert_free_buffer(struct binder_proc *proc, +static void binder_insert_free_buffer(struct binder_alloc *alloc, struct binder_buffer *new_buffer) { - struct rb_node **p = &proc->alloc.free_buffers.rb_node; + struct rb_node **p = &alloc->free_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; size_t buffer_size; @@ -528,18 +599,18 @@ static void binder_insert_free_buffer(struct binder_proc *proc, BUG_ON(!new_buffer->free); - new_buffer_size = binder_buffer_size(proc, new_buffer); + new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: add free buffer, size %zd, at %p\n", - proc->pid, new_buffer_size, new_buffer); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: add free buffer, size %zd, at %pK\n", + alloc->pid, new_buffer_size, new_buffer); while (*p) { parent = *p; buffer = rb_entry(parent, struct binder_buffer, rb_node); BUG_ON(!buffer->free); - buffer_size = binder_buffer_size(proc, buffer); + buffer_size = binder_alloc_buffer_size(alloc, buffer); if (new_buffer_size < buffer_size) p = &parent->rb_left; @@ -547,13 +618,13 @@ static void binder_insert_free_buffer(struct 
binder_proc *proc, p = &parent->rb_right; } rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->alloc.free_buffers); + rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers); } -static void binder_insert_allocated_buffer(struct binder_proc *proc, +static void binder_insert_allocated_buffer(struct binder_alloc *alloc, struct binder_buffer *new_buffer) { - struct rb_node **p = &proc->alloc.allocated_buffers.rb_node; + struct rb_node **p = &alloc->allocated_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; @@ -572,19 +643,19 @@ static void binder_insert_allocated_buffer(struct binder_proc *proc, BUG(); } rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->alloc.allocated_buffers); + rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); } -static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc, - uintptr_t user_ptr) +static struct binder_buffer *binder_alloc_buffer_lookup_locked( + struct binder_alloc *alloc, + uintptr_t user_ptr) { - struct rb_node *n = proc->alloc.allocated_buffers.rb_node; + struct rb_node *n = alloc->allocated_buffers.rb_node; struct binder_buffer *buffer; struct binder_buffer *kern_ptr; - kern_ptr = (struct binder_buffer *) - (user_ptr - proc->alloc.user_buffer_offset - - offsetof(struct binder_buffer, data)); + kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset + - offsetof(struct binder_buffer, data)); while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -600,7 +671,18 @@ static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc, return NULL; } -static int binder_update_page_range(struct binder_proc *proc, int allocate, +struct binder_buffer *binder_alloc_buffer_lookup(struct binder_alloc *alloc, + uintptr_t user_ptr) +{ + struct binder_buffer *buffer; + + mutex_lock(&alloc->mutex); + buffer = binder_alloc_buffer_lookup_locked(alloc, user_ptr); + mutex_unlock(&alloc->mutex); + return buffer; +} + +static int binder_update_page_range(struct binder_alloc *alloc, int allocate, void *start, void *end, struct vm_area_struct *vma) { @@ -609,26 +691,26 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, struct page **page; struct mm_struct *mm; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: %s pages %p-%p\n", proc->pid, + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: %s pages %pK-%pK\n", alloc->pid, allocate ? 
"allocate" : "free", start, end); if (end <= start) return 0; - trace_binder_update_page_range(proc, allocate, start, end); + trace_binder_update_page_range(alloc, allocate, start, end); if (vma) mm = NULL; else - mm = get_task_mm(proc->tsk); + mm = get_task_mm(alloc->tsk); if (mm) { down_write(&mm->mmap_sem); - vma = proc->alloc.vma; - if (vma && mm != proc->alloc.vma_vm_mm) { + vma = alloc->vma; + if (vma && mm != alloc->vma_vm_mm) { pr_err("%d: vma mm and task mm mismatch\n", - proc->pid); + alloc->pid); vma = NULL; } } @@ -638,21 +720,20 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, if (vma == NULL) { pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", - proc->pid); + alloc->pid); goto err_no_vma; } for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - page = &proc->alloc.pages[ - (page_addr - proc->alloc.buffer) / PAGE_SIZE]; + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; BUG_ON(*page); *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); if (*page == NULL) { - pr_err("%d: binder_alloc_buf failed for page at %p\n", - proc->pid, page_addr); + pr_err("%d: binder_alloc_buf failed for page at %pK\n", + alloc->pid, page_addr); goto err_alloc_page_failed; } ret = map_kernel_range_noflush((unsigned long)page_addr, @@ -660,16 +741,16 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, flush_cache_vmap((unsigned long)page_addr, (unsigned long)page_addr + PAGE_SIZE); if (ret != 1) { - pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", - proc->pid, page_addr); + pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n", + alloc->pid, page_addr); goto err_map_kernel_failed; } user_page_addr = - (uintptr_t)page_addr + proc->alloc.user_buffer_offset; + (uintptr_t)page_addr + alloc->user_buffer_offset; ret = vm_insert_page(vma, user_page_addr, page[0]); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", - proc->pid, user_page_addr); + alloc->pid, user_page_addr); goto err_vm_insert_page_failed; } /* vm_insert_page does not seem to increment the refcount */ @@ -683,14 +764,10 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { - - page = &proc->alloc.pages[ - (page_addr - proc->alloc.buffer) / PAGE_SIZE]; + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; if (vma) - zap_page_range(vma, - (uintptr_t)page_addr + - proc->alloc.user_buffer_offset, - PAGE_SIZE, NULL); + zap_page_range(vma, (uintptr_t)page_addr + + alloc->user_buffer_offset, PAGE_SIZE, NULL); err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: @@ -707,13 +784,11 @@ err_no_vma: return -ENOMEM; } -static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, - size_t data_size, - size_t offsets_size, - size_t extra_buffers_size, - int is_async) +static struct binder_buffer *binder_alloc_new_buf_locked( + struct binder_alloc *alloc, size_t data_size, + size_t offsets_size, size_t extra_buffers_size, int is_async) { - struct rb_node *n = proc->alloc.free_buffers.rb_node; + struct rb_node *n = alloc->free_buffers.rb_node; struct binder_buffer *buffer; size_t buffer_size; struct rb_node *best_fit = NULL; @@ -721,9 +796,9 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, void *end_page_addr; size_t size, data_offsets_size; - if 
(proc->alloc.vma == NULL) { + if (alloc->vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", - proc->pid); + alloc->pid); return NULL; } @@ -731,29 +806,30 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, ALIGN(offsets_size, sizeof(void *)); if (data_offsets_size < data_size || data_offsets_size < offsets_size) { - binder_user_error("%d: got transaction with invalid size %zd-%zd\n", - proc->pid, data_size, offsets_size); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: got transaction with invalid size %zd-%zd\n", + alloc->pid, data_size, offsets_size); return NULL; } size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); if (size < data_offsets_size || size < extra_buffers_size) { - binder_user_error("%d: got transaction with invalid extra_buffers_size %zd\n", - proc->pid, extra_buffers_size); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: got transaction with invalid extra_buffers_size %zd\n", + alloc->pid, extra_buffers_size); return NULL; } if (is_async && - proc->alloc.free_async_space < - size + sizeof(struct binder_buffer)) { - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + alloc->free_async_space < size + sizeof(struct binder_buffer)) { + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd failed, no async space left\n", - proc->pid, size); + alloc->pid, size); return NULL; } while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(!buffer->free); - buffer_size = binder_buffer_size(proc, buffer); + buffer_size = binder_alloc_buffer_size(alloc, buffer); if (size < buffer_size) { best_fit = n; @@ -767,17 +843,17 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, } if (best_fit == NULL) { pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", - proc->pid, size); + alloc->pid, size); return NULL; } if (n == NULL) { buffer = rb_entry(best_fit, struct binder_buffer, rb_node); - buffer_size = binder_buffer_size(proc, buffer); + buffer_size = binder_alloc_buffer_size(alloc, buffer); } - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got buffer %p size %zd\n", - proc->pid, size, buffer, buffer_size); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n", + alloc->pid, size, buffer, buffer_size); has_page_addr = (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); @@ -791,38 +867,52 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; - if (binder_update_page_range(proc, 1, + if (binder_update_page_range(alloc, 1, (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) return NULL; - rb_erase(best_fit, &proc->alloc.free_buffers); + rb_erase(best_fit, &alloc->free_buffers); buffer->free = 0; - binder_insert_allocated_buffer(proc, buffer); + binder_insert_allocated_buffer(alloc, buffer); if (buffer_size != size) { struct binder_buffer *new_buffer = (void *)buffer->data + size; list_add(&new_buffer->entry, &buffer->entry); new_buffer->free = 1; - binder_insert_free_buffer(proc, new_buffer); + binder_insert_free_buffer(alloc, new_buffer); } - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got %p\n", - proc->pid, size, buffer); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got %pK\n", + alloc->pid, size, buffer); buffer->data_size = 
data_size; buffer->offsets_size = offsets_size; - buffer->extra_buffers_size = extra_buffers_size; buffer->async_transaction = is_async; + buffer->extra_buffers_size = extra_buffers_size; if (is_async) { - proc->alloc.free_async_space -= - size + sizeof(struct binder_buffer); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + alloc->free_async_space -= size + sizeof(struct binder_buffer); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_alloc_buf size %zd async free %zd\n", - proc->pid, size, proc->alloc.free_async_space); + alloc->pid, size, alloc->free_async_space); } return buffer; } +struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async) +{ + struct binder_buffer *buffer; + + mutex_lock(&alloc->mutex); + buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size, + extra_buffers_size, is_async); + mutex_unlock(&alloc->mutex); + return buffer; +} + static void *buffer_start_page(struct binder_buffer *buffer) { return (void *)((uintptr_t)buffer & PAGE_MASK); @@ -833,26 +923,26 @@ static void *buffer_end_page(struct binder_buffer *buffer) return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); } -static void binder_delete_free_buffer(struct binder_proc *proc, +static void binder_delete_free_buffer(struct binder_alloc *alloc, struct binder_buffer *buffer) { struct binder_buffer *prev, *next = NULL; int free_page_end = 1; int free_page_start = 1; - BUG_ON(proc->alloc.buffers.next == &buffer->entry); + BUG_ON(alloc->buffers.next == &buffer->entry); prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); BUG_ON(!prev->free); if (buffer_end_page(prev) == buffer_start_page(buffer)) { free_page_start = 0; if (buffer_end_page(prev) == buffer_end_page(buffer)) free_page_end = 0; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p share page with %p\n", - proc->pid, buffer, prev); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer, prev); } - if (!list_is_last(&buffer->entry, &proc->alloc.buffers)) { + if (!list_is_last(&buffer->entry, &alloc->buffers)) { next = list_entry(buffer->entry.next, struct binder_buffer, entry); if (buffer_start_page(next) == buffer_end_page(buffer)) { @@ -860,80 +950,88 @@ static void binder_delete_free_buffer(struct binder_proc *proc, if (buffer_start_page(next) == buffer_start_page(buffer)) free_page_start = 0; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p share page with %p\n", - proc->pid, buffer, prev); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer, prev); } } list_del(&buffer->entry); if (free_page_start || free_page_end) { - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p do not share page%s%s with %p or %p\n", - proc->pid, buffer, free_page_start ? "" : " end", + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", + alloc->pid, buffer, free_page_start ? "" : " end", free_page_end ? "" : " start", prev, next); - binder_update_page_range(proc, 0, free_page_start ? + binder_update_page_range(alloc, 0, free_page_start ? buffer_start_page(buffer) : buffer_end_page(buffer), (free_page_end ? 
buffer_end_page(buffer) : buffer_start_page(buffer)) + PAGE_SIZE, NULL); } } -static void binder_free_buf(struct binder_proc *proc, - struct binder_buffer *buffer) +static void binder_free_buf_locked(struct binder_alloc *alloc, + struct binder_buffer *buffer) { size_t size, buffer_size; - buffer_size = binder_buffer_size(proc, buffer); + buffer_size = binder_alloc_buffer_size(alloc, buffer); size = ALIGN(buffer->data_size, sizeof(void *)) + ALIGN(buffer->offsets_size, sizeof(void *)) + ALIGN(buffer->extra_buffers_size, sizeof(void *)); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_free_buf %p size %zd buffer_size %zd\n", - proc->pid, buffer, size, buffer_size); + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%d: binder_free_buf %pK size %zd buffer_size %zd\n", + alloc->pid, buffer, size, buffer_size); BUG_ON(buffer->free); BUG_ON(size > buffer_size); BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < proc->alloc.buffer); - BUG_ON((void *)buffer > proc->alloc.buffer + proc->alloc.buffer_size); + BUG_ON((void *)buffer < alloc->buffer); + BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { - proc->alloc.free_async_space += - size + sizeof(struct binder_buffer); + alloc->free_async_space += size + sizeof(struct binder_buffer); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_free_buf size %zd async free %zd\n", - proc->pid, size, proc->alloc.free_async_space); + alloc->pid, size, alloc->free_async_space); } - binder_update_page_range(proc, 0, + binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), NULL); - rb_erase(&buffer->rb_node, &proc->alloc.allocated_buffers); + + rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; - if (!list_is_last(&buffer->entry, &proc->alloc.buffers)) { + if (!list_is_last(&buffer->entry, &alloc->buffers)) { struct binder_buffer *next = list_entry(buffer->entry.next, struct binder_buffer, entry); if (next->free) { - rb_erase(&next->rb_node, &proc->alloc.free_buffers); - binder_delete_free_buffer(proc, next); + rb_erase(&next->rb_node, &alloc->free_buffers); + binder_delete_free_buffer(alloc, next); } } - if (proc->alloc.buffers.next != &buffer->entry) { + if (alloc->buffers.next != &buffer->entry) { struct binder_buffer *prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); if (prev->free) { - binder_delete_free_buffer(proc, buffer); - rb_erase(&prev->rb_node, &proc->alloc.free_buffers); + binder_delete_free_buffer(alloc, buffer); + rb_erase(&prev->rb_node, &alloc->free_buffers); buffer = prev; } } - binder_insert_free_buffer(proc, buffer); + binder_insert_free_buffer(alloc, buffer); +} + +void binder_alloc_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + mutex_lock(&alloc->mutex); + binder_free_buf_locked(alloc, buffer); + mutex_unlock(&alloc->mutex); } static struct binder_node *binder_get_node(struct binder_proc *proc, @@ -1568,7 +1666,8 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, * back to kernel address space to access it */ parent_buffer = parent->buffer - - proc->alloc.user_buffer_offset; + binder_alloc_get_user_buffer_offset( + &proc->alloc); fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { @@ -1786,7 +1885,8 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, * Since the parent 
was already fixed up, convert it * back to the kernel address space to access it */ - parent_buffer = parent->buffer - target_proc->alloc.user_buffer_offset; + parent_buffer = parent->buffer - + binder_alloc_get_user_buffer_offset(&target_proc->alloc); fd_array = (u32 *)(parent_buffer + fda->parent_offset); if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", @@ -1854,7 +1954,8 @@ static int binder_fixup_parent(struct binder_transaction *t, return -EINVAL; } parent_buffer = (u8 *)(parent->buffer - - target_proc->alloc.user_buffer_offset); + binder_alloc_get_user_buffer_offset( + &target_proc->alloc)); *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; return 0; @@ -2040,7 +2141,7 @@ static void binder_transaction(struct binder_proc *proc, trace_binder_transaction(reply, t, target_node); - t->buffer = binder_alloc_buf(target_proc, tr->data_size, + t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, tr->offsets_size, extra_buffers_size, !reply && (t->flags & TF_ONE_WAY)); if (t->buffer == NULL) { @@ -2194,7 +2295,8 @@ static void binder_transaction(struct binder_proc *proc, } /* Fixup buffer pointer to target proc address space */ bp->buffer = (uintptr_t)sg_bufp + - target_proc->alloc.user_buffer_offset; + binder_alloc_get_user_buffer_offset( + &target_proc->alloc); sg_bufp += ALIGN(bp->length, sizeof(u64)); ret = binder_fixup_parent(t, thread, bp, off_start, @@ -2252,7 +2354,7 @@ err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); t->buffer->transaction = NULL; - binder_free_buf(target_proc, t->buffer); + binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); @@ -2432,7 +2534,8 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(binder_uintptr_t); - buffer = binder_buffer_lookup(proc, data_ptr); + buffer = binder_alloc_buffer_lookup(&proc->alloc, + data_ptr); if (buffer == NULL) { binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n", proc->pid, thread->pid, (u64)data_ptr); @@ -2462,7 +2565,7 @@ static int binder_thread_write(struct binder_proc *proc, } trace_binder_transaction_buffer_release(buffer); binder_transaction_buffer_release(proc, buffer, NULL); - binder_free_buf(proc, buffer); + binder_alloc_free_buf(&proc->alloc, buffer); break; } @@ -2954,9 +3057,9 @@ retry: tr.data_size = t->buffer->data_size; tr.offsets_size = t->buffer->offsets_size; - tr.data.ptr.buffer = (binder_uintptr_t)( - (uintptr_t)t->buffer->data + - proc->alloc.user_buffer_offset); + tr.data.ptr.buffer = (binder_uintptr_t) + ((uintptr_t)t->buffer->data + + binder_alloc_get_user_buffer_offset(&proc->alloc)); tr.data.ptr.offsets = tr.data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); @@ -3365,6 +3468,12 @@ static void binder_vma_open(struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot)); } +void binder_alloc_vma_close(struct binder_alloc *alloc) +{ + WRITE_ONCE(alloc->vma, NULL); + WRITE_ONCE(alloc->vma_vm_mm, NULL); +} + static void binder_vma_close(struct vm_area_struct *vma) { struct binder_proc *proc = vma->vm_private_data; @@ -3374,8 +3483,7 @@ static void binder_vma_close(struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); - 
proc->alloc.vma = NULL; - proc->alloc.vma_vm_mm = NULL; + binder_alloc_vma_close(&proc->alloc); binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } @@ -3390,35 +3498,16 @@ static const struct vm_operations_struct binder_vm_ops = { .fault = binder_vm_fault, }; -static int binder_mmap(struct file *filp, struct vm_area_struct *vma) +int binder_alloc_mmap_handler(struct binder_alloc *alloc, + struct vm_area_struct *vma) { int ret; struct vm_struct *area; - struct binder_proc *proc = filp->private_data; const char *failure_string; struct binder_buffer *buffer; - if (proc->tsk != current->group_leader) - return -EINVAL; - - if ((vma->vm_end - vma->vm_start) > SZ_4M) - vma->vm_end = vma->vm_start + SZ_4M; - - binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "binder_mmap: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", - proc->pid, vma->vm_start, vma->vm_end, - (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, - (unsigned long)pgprot_val(vma->vm_page_prot)); - - if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { - ret = -EPERM; - failure_string = "bad vm_flags"; - goto err_bad_arg; - } - vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; - - mutex_lock(&binder_mmap_lock); - if (proc->alloc.buffer) { + mutex_lock(&binder_alloc_mmap_lock); + if (alloc->buffer) { ret = -EBUSY; failure_string = "already mapped"; goto err_already_mapped; @@ -3430,74 +3519,111 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) failure_string = "get_vm_area"; goto err_get_vm_area_failed; } - proc->alloc.buffer = area->addr; - proc->alloc.user_buffer_offset = - vma->vm_start - (uintptr_t)proc->alloc.buffer; - mutex_unlock(&binder_mmap_lock); + alloc->buffer = area->addr; + alloc->user_buffer_offset = + vma->vm_start - (uintptr_t)alloc->buffer; + mutex_unlock(&binder_alloc_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { while (CACHE_COLOUR( - (vma->vm_start ^ (uint32_t)proc->alloc.buffer))) { + (vma->vm_start ^ (uint32_t)alloc->buffer))) { pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", - proc->pid, vma->vm_start, - vma->vm_end, proc->alloc.buffer); + alloc->pid, vma->vm_start, vma->vm_end, + alloc->buffer); vma->vm_start += PAGE_SIZE; } } #endif - proc->alloc.pages = - kzalloc(sizeof(proc->alloc.pages[0]) * - ((vma->vm_end - vma->vm_start) / PAGE_SIZE), - GFP_KERNEL); - if (proc->alloc.pages == NULL) { + alloc->pages = kzalloc(sizeof(alloc->pages[0]) * + ((vma->vm_end - vma->vm_start) / PAGE_SIZE), + GFP_KERNEL); + if (alloc->pages == NULL) { ret = -ENOMEM; failure_string = "alloc page array"; goto err_alloc_pages_failed; } - proc->alloc.buffer_size = vma->vm_end - vma->vm_start; + alloc->buffer_size = vma->vm_end - vma->vm_start; - vma->vm_ops = &binder_vm_ops; - vma->vm_private_data = proc; - - if (binder_update_page_range(proc, 1, proc->alloc.buffer, - proc->alloc.buffer + PAGE_SIZE, vma)) { + if (binder_update_page_range(alloc, 1, alloc->buffer, + alloc->buffer + PAGE_SIZE, vma)) { ret = -ENOMEM; failure_string = "alloc small buf"; goto err_alloc_small_buf_failed; } - buffer = proc->alloc.buffer; - INIT_LIST_HEAD(&proc->alloc.buffers); - list_add(&buffer->entry, &proc->alloc.buffers); + buffer = alloc->buffer; + INIT_LIST_HEAD(&alloc->buffers); + list_add(&buffer->entry, &alloc->buffers); buffer->free = 1; - binder_insert_free_buffer(proc, buffer); - proc->alloc.free_async_space = proc->alloc.buffer_size / 2; + binder_insert_free_buffer(alloc, buffer); + alloc->free_async_space = alloc->buffer_size / 2; barrier(); - proc->files = get_files_struct(current); - 
proc->alloc.vma = vma; - proc->alloc.vma_vm_mm = vma->vm_mm; + alloc->vma = vma; + alloc->vma_vm_mm = vma->vm_mm; - /*pr_info("binder_mmap: %d %lx-%lx maps %pK\n", - * proc->pid, vma->vm_start, vma->vm_end, proc->alloc.buffer); - */ return 0; err_alloc_small_buf_failed: - kfree(proc->alloc.pages); - proc->alloc.pages = NULL; + kfree(alloc->pages); + alloc->pages = NULL; err_alloc_pages_failed: - mutex_lock(&binder_mmap_lock); - vfree(proc->alloc.buffer); - proc->alloc.buffer = NULL; + mutex_lock(&binder_alloc_mmap_lock); + vfree(alloc->buffer); + alloc->buffer = NULL; err_get_vm_area_failed: err_already_mapped: - mutex_unlock(&binder_mmap_lock); + mutex_unlock(&binder_alloc_mmap_lock); + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, + alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); + return ret; +} + +static int binder_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int ret; + struct binder_proc *proc = filp->private_data; + const char *failure_string; + + if (proc->tsk != current->group_leader) + return -EINVAL; + + if ((vma->vm_end - vma->vm_start) > SZ_4M) + vma->vm_end = vma->vm_start + SZ_4M; + + binder_debug(BINDER_DEBUG_OPEN_CLOSE, + "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", + __func__, proc->pid, vma->vm_start, vma->vm_end, + (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, + (unsigned long)pgprot_val(vma->vm_page_prot)); + + if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { + ret = -EPERM; + failure_string = "bad vm_flags"; + goto err_bad_arg; + } + vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; + vma->vm_ops = &binder_vm_ops; + vma->vm_private_data = proc; + + ret = binder_alloc_mmap_handler(&proc->alloc, vma); + if (ret) + return ret; + proc->files = get_files_struct(current); + return 0; + err_bad_arg: pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); return ret; } +void binder_alloc_init(struct binder_alloc *alloc) +{ + alloc->tsk = current->group_leader; + alloc->pid = current->group_leader->pid; + mutex_init(&alloc->mutex); +} + static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc; @@ -3517,6 +3643,7 @@ static int binder_open(struct inode *nodp, struct file *filp) binder_dev = container_of(filp->private_data, struct binder_device, miscdev); proc->context = &binder_dev->context; + binder_alloc_init(&proc->alloc); binder_lock(__func__); @@ -3632,14 +3759,61 @@ static int binder_node_release(struct binder_node *node, int refs) return refs; } +void binder_alloc_deferred_release(struct binder_alloc *alloc) +{ + struct rb_node *n; + int buffers, page_count; + + BUG_ON(alloc->vma); + + buffers = 0; + mutex_lock(&alloc->mutex); + while ((n = rb_first(&alloc->allocated_buffers))) { + struct binder_buffer *buffer; + + buffer = rb_entry(n, struct binder_buffer, rb_node); + + /* Transaction should already have been freed */ + BUG_ON(buffer->transaction); + + binder_free_buf_locked(alloc, buffer); + buffers++; + } + + page_count = 0; + if (alloc->pages) { + int i; + + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + void *page_addr; + + if (!alloc->pages[i]) + continue; + + page_addr = alloc->buffer + i * PAGE_SIZE; + binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, + "%s: %d: page %d at %pK not freed\n", + __func__, alloc->pid, i, page_addr); + unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); + __free_page(alloc->pages[i]); + page_count++; + } + kfree(alloc->pages); + vfree(alloc->buffer); + } + mutex_unlock(&alloc->mutex); + + 
binder_alloc_debug(BINDER_ALLOC_DEBUG_OPEN_CLOSE, + "%s: %d buffers %d, pages %d\n", + __func__, alloc->pid, buffers, page_count); +} + static void binder_deferred_release(struct binder_proc *proc) { struct binder_context *context = proc->context; struct rb_node *n; - int threads, nodes, incoming_refs, outgoing_refs, buffers, - active_transactions, page_count; + int threads, nodes, incoming_refs, outgoing_refs, active_transactions; - BUG_ON(proc->alloc.vma); BUG_ON(proc->files); hlist_del(&proc->proc_node); @@ -3685,49 +3859,15 @@ static void binder_deferred_release(struct binder_proc *proc) binder_release_work(&proc->todo); binder_release_work(&proc->delivered_death); - buffers = 0; - while ((n = rb_first(&proc->alloc.allocated_buffers))) { - struct binder_buffer *buffer; - - buffer = rb_entry(n, struct binder_buffer, rb_node); - - /* Transaction should already have been freed */ - BUG_ON(buffer->transaction); - - binder_free_buf(proc, buffer); - buffers++; - } - + binder_alloc_deferred_release(&proc->alloc); binder_stats_deleted(BINDER_STAT_PROC); - page_count = 0; - if (proc->alloc.pages) { - int i; - - for (i = 0; i < proc->alloc.buffer_size / PAGE_SIZE; i++) { - void *page_addr; - - if (!proc->alloc.pages[i]) - continue; - - page_addr = proc->alloc.buffer + i * PAGE_SIZE; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%s: %d: page %d at %p not freed\n", - __func__, proc->pid, i, page_addr); - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(proc->alloc.pages[i]); - page_count++; - } - kfree(proc->alloc.pages); - vfree(proc->alloc.buffer); - } - put_task_struct(proc->tsk); binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d, buffers %d, pages %d\n", + "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n", __func__, proc->pid, threads, nodes, incoming_refs, - outgoing_refs, active_transactions, buffers, page_count); + outgoing_refs, active_transactions); kfree(proc); } @@ -3810,15 +3950,6 @@ static void print_binder_transaction(struct seq_file *m, const char *prefix, t->buffer->data); } -static void print_binder_buffer(struct seq_file *m, const char *prefix, - struct binder_buffer *buffer) -{ - seq_printf(m, "%s %d: %p size %zd:%zd %s\n", - prefix, buffer->debug_id, buffer->data, - buffer->data_size, buffer->offsets_size, - buffer->transaction ? "active" : "delivered"); -} - static void print_binder_work(struct seq_file *m, const char *prefix, const char *transaction_prefix, struct binder_work *w) @@ -3921,6 +4052,27 @@ static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) ref->node->debug_id, ref->strong, ref->weak, ref->death); } +static void print_binder_buffer(struct seq_file *m, const char *prefix, + struct binder_buffer *buffer) +{ + seq_printf(m, "%s %d: %pK size %zd:%zd %s\n", + prefix, buffer->debug_id, buffer->data, + buffer->data_size, buffer->offsets_size, + buffer->transaction ? 
"active" : "delivered"); +} + +void binder_alloc_print_allocated(struct seq_file *m, + struct binder_alloc *alloc) +{ + struct rb_node *n; + + mutex_lock(&alloc->mutex); + for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) + print_binder_buffer(m, " buffer", + rb_entry(n, struct binder_buffer, rb_node)); + mutex_unlock(&alloc->mutex); +} + static void print_binder_proc(struct seq_file *m, struct binder_proc *proc, int print_all) { @@ -3949,10 +4101,7 @@ static void print_binder_proc(struct seq_file *m, print_binder_ref(m, rb_entry(n, struct binder_ref, rb_node_desc)); } - for (n = rb_first(&proc->alloc.allocated_buffers); - n != NULL; n = rb_next(n)) - print_binder_buffer(m, " buffer", - rb_entry(n, struct binder_buffer, rb_node)); + binder_alloc_print_allocated(m, &proc->alloc); list_for_each_entry(w, &proc->todo, entry) print_binder_work(m, " ", " pending transaction", w); list_for_each_entry(w, &proc->delivered_death, entry) { @@ -4050,6 +4199,18 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, } } +int binder_alloc_get_allocated_count(struct binder_alloc *alloc) +{ + struct rb_node *n; + int count = 0; + + mutex_lock(&alloc->mutex); + for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) + count++; + mutex_unlock(&alloc->mutex); + return count; +} + static void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { @@ -4067,7 +4228,8 @@ static void print_binder_proc_stats(struct seq_file *m, " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, - proc->ready_threads, proc->alloc.free_async_space); + proc->ready_threads, + binder_alloc_get_free_async_space(&proc->alloc)); count = 0; for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; @@ -4084,10 +4246,7 @@ static void print_binder_proc_stats(struct seq_file *m, } seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); - count = 0; - for (n = rb_first(&proc->alloc.allocated_buffers); - n != NULL; n = rb_next(n)) - count++; + count = binder_alloc_get_allocated_count(&proc->alloc); seq_printf(m, " buffers: %d\n", count); count = 0; diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index c835f09656c1..50b0d21f42cf 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -23,6 +23,7 @@ struct binder_buffer; struct binder_node; struct binder_proc; +struct binder_alloc; struct binder_ref; struct binder_thread; struct binder_transaction; @@ -268,9 +269,9 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, TP_ARGS(buffer)); TRACE_EVENT(binder_update_page_range, - TP_PROTO(struct binder_proc *proc, bool allocate, + TP_PROTO(struct binder_alloc *alloc, bool allocate, void *start, void *end), - TP_ARGS(proc, allocate, start, end), + TP_ARGS(alloc, allocate, start, end), TP_STRUCT__entry( __field(int, proc) __field(bool, allocate) @@ -278,9 +279,9 @@ TRACE_EVENT(binder_update_page_range, __field(size_t, size) ), TP_fast_assign( - __entry->proc = proc->pid; + __entry->proc = alloc->pid; __entry->allocate = allocate; - __entry->offset = start - proc->alloc.buffer; + __entry->offset = start - alloc->buffer; __entry->size = end - start; ), TP_printk("proc=%d allocate=%d offset=%zu size=%zu", From 2324f70c5aab8b8e5f4c100a80d1dd4999991285 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 10 Oct 2016 10:40:53 -0700 Subject: [PATCH 0904/3220] FROMLIST: binder: move binder_alloc to separate file (from 
https://patchwork.kernel.org/patch/9817753/) Move the binder allocator functionality to its own file Continuation of splitting the binder allocator from the binder driver. Split binder_alloc functions from normal binder functions. Add kernel doc comments to functions declared extern in binder_alloc.h Change-Id: I8f1a967375359078b8e63c7b6b88a752c374a64a Signed-off-by: Todd Kjos --- drivers/android/Makefile | 2 +- drivers/android/binder.c | 763 +-------------------------------- drivers/android/binder_alloc.c | 759 ++++++++++++++++++++++++++++++++ drivers/android/binder_alloc.h | 162 +++++++ 4 files changed, 923 insertions(+), 763 deletions(-) create mode 100644 drivers/android/binder_alloc.c create mode 100644 drivers/android/binder_alloc.h diff --git a/drivers/android/Makefile b/drivers/android/Makefile index 3b7e4b072c58..4b7c726bb560 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -1,3 +1,3 @@ ccflags-y += -I$(src) # needed for trace events -obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o +obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e73481a8ea2c..3251b7cdd717 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -34,8 +33,6 @@ #include #include #include -#include -#include #include #include @@ -44,11 +41,11 @@ #endif #include +#include "binder_alloc.h" #include "binder_trace.h" static DEFINE_MUTEX(binder_main_lock); static DEFINE_MUTEX(binder_deferred_lock); -static DEFINE_MUTEX(binder_alloc_mmap_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); @@ -157,27 +154,6 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error, #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) -/* - * debug declarations for binder_alloc. To be - * moved to binder_alloc.c - */ -enum { - BINDER_ALLOC_DEBUG_OPEN_CLOSE = 1U << 1, - BINDER_ALLOC_DEBUG_BUFFER_ALLOC = 1U << 2, - BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, -}; -static uint32_t binder_alloc_debug_mask; - -module_param_named(alloc_debug_mask, binder_alloc_debug_mask, - uint, S_IWUSR | S_IRUGO); - -#define binder_alloc_debug(mask, x...) 
\ - do { \ - if (binder_alloc_debug_mask & mask) \ - pr_info(x); \ - } while (0) -/* end of binder_alloc debug declarations */ - enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, @@ -314,68 +290,12 @@ struct binder_ref { struct binder_ref_death *death; }; -struct binder_buffer { - struct list_head entry; /* free and allocated entries by address */ - struct rb_node rb_node; /* free entry by size or allocated entry */ - /* by address */ - unsigned free:1; - unsigned allow_user_free:1; - unsigned async_transaction:1; - unsigned debug_id:29; - - struct binder_transaction *transaction; - - struct binder_node *target_node; - size_t data_size; - size_t offsets_size; - size_t extra_buffers_size; - uint8_t data[0]; -}; - enum binder_deferred_state { BINDER_DEFERRED_PUT_FILES = 0x01, BINDER_DEFERRED_FLUSH = 0x02, BINDER_DEFERRED_RELEASE = 0x04, }; -/** - * struct binder_alloc - per-binder proc state for binder allocator - * @vma: vm_area_struct passed to mmap_handler - * (invarient after mmap) - * @vma_vm_mm: copy of vma->vm_mm (invarient after mmap) - * @buffer: base of per-proc address space mapped via mmap - * @user_buffer_offset: offset between user and kernel VAs for buffer - * @buffers: list of all buffers for this proc - * @free_buffers: rb tree of buffers available for allocation - * sorted by size - * @allocated_buffers: rb tree of allocated buffers sorted by address - * @free_async_space: VA space available for async buffers. This is - * initialized at mmap time to 1/2 the full VA space - * @pages: array of physical page addresses for each page of - * mmap'd space - * @buffer_size: size of address space (could be less than requested) - * - * Bookkeeping structure for per-proc address space management for binder - * buffers. It is normally initialized during binder_init() and binder_mmap() - * calls. The address space is used for both user-visible buffers and for - * struct binder_buffer objects used to track the user buffers - */ -struct binder_alloc { - struct mutex mutex; - struct task_struct *tsk; - struct vm_area_struct *vma; - struct mm_struct *vma_vm_mm; - void *buffer; - ptrdiff_t user_buffer_offset; - struct list_head buffers; - struct rb_root free_buffers; - struct rb_root allocated_buffers; - size_t free_async_space; - struct page **pages; - size_t buffer_size; - int pid; -}; - struct binder_proc { struct hlist_node proc_node; struct rb_root threads; @@ -445,56 +365,6 @@ struct binder_transaction { kuid_t sender_euid; }; -/* - * Forward declarations of binder_alloc functions. - * These will be moved to binder_alloc.h when - * binder_alloc is moved to its own files. 
- */ -extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, - size_t data_size, - size_t offsets_size, - size_t extra_buffers_size, - int is_async); -extern void binder_alloc_init(struct binder_alloc *alloc); -extern void binder_alloc_vma_close(struct binder_alloc *alloc); -extern struct binder_buffer * -binder_alloc_buffer_lookup(struct binder_alloc *alloc, - uintptr_t user_ptr); -extern void binder_alloc_free_buf(struct binder_alloc *alloc, - struct binder_buffer *buffer); -extern int binder_alloc_mmap_handler(struct binder_alloc *alloc, - struct vm_area_struct *vma); -extern void binder_alloc_deferred_release(struct binder_alloc *alloc); -extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc); -extern void binder_alloc_print_allocated(struct seq_file *m, - struct binder_alloc *alloc); - -static inline size_t -binder_alloc_get_free_async_space(struct binder_alloc *alloc) -{ - size_t free_async_space; - - mutex_lock(&alloc->mutex); - free_async_space = alloc->free_async_space; - mutex_unlock(&alloc->mutex); - return free_async_space; -} - -static inline ptrdiff_t -binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc) -{ - /* - * user_buffer_offset is constant if vma is set and - * undefined if vma is not set. It is possible to - * get here with !alloc->vma if the target process - * is dying while a transaction is being initiated. - * Returning the old value is ok in this case and - * the transaction will fail. - */ - return alloc->user_buffer_offset; -} -/* end of binder_alloc declarations */ - static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); @@ -578,462 +448,6 @@ static void binder_set_nice(long nice) binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } -static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, - struct binder_buffer *buffer) -{ - if (list_is_last(&buffer->entry, &alloc->buffers)) - return alloc->buffer + - alloc->buffer_size - (void *)buffer->data; - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; -} - -static void binder_insert_free_buffer(struct binder_alloc *alloc, - struct binder_buffer *new_buffer) -{ - struct rb_node **p = &alloc->free_buffers.rb_node; - struct rb_node *parent = NULL; - struct binder_buffer *buffer; - size_t buffer_size; - size_t new_buffer_size; - - BUG_ON(!new_buffer->free); - - new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer); - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: add free buffer, size %zd, at %pK\n", - alloc->pid, new_buffer_size, new_buffer); - - while (*p) { - parent = *p; - buffer = rb_entry(parent, struct binder_buffer, rb_node); - BUG_ON(!buffer->free); - - buffer_size = binder_alloc_buffer_size(alloc, buffer); - - if (new_buffer_size < buffer_size) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers); -} - -static void binder_insert_allocated_buffer(struct binder_alloc *alloc, - struct binder_buffer *new_buffer) -{ - struct rb_node **p = &alloc->allocated_buffers.rb_node; - struct rb_node *parent = NULL; - struct binder_buffer *buffer; - - BUG_ON(new_buffer->free); - - while (*p) { - parent = *p; - buffer = rb_entry(parent, struct binder_buffer, rb_node); - BUG_ON(buffer->free); - - if (new_buffer < buffer) - p = &parent->rb_left; - else if (new_buffer > buffer) - p = &parent->rb_right; - else - BUG(); - } - 
rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); -} - -static struct binder_buffer *binder_alloc_buffer_lookup_locked( - struct binder_alloc *alloc, - uintptr_t user_ptr) -{ - struct rb_node *n = alloc->allocated_buffers.rb_node; - struct binder_buffer *buffer; - struct binder_buffer *kern_ptr; - - kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset - - offsetof(struct binder_buffer, data)); - - while (n) { - buffer = rb_entry(n, struct binder_buffer, rb_node); - BUG_ON(buffer->free); - - if (kern_ptr < buffer) - n = n->rb_left; - else if (kern_ptr > buffer) - n = n->rb_right; - else - return buffer; - } - return NULL; -} - -struct binder_buffer *binder_alloc_buffer_lookup(struct binder_alloc *alloc, - uintptr_t user_ptr) -{ - struct binder_buffer *buffer; - - mutex_lock(&alloc->mutex); - buffer = binder_alloc_buffer_lookup_locked(alloc, user_ptr); - mutex_unlock(&alloc->mutex); - return buffer; -} - -static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) -{ - void *page_addr; - unsigned long user_page_addr; - struct page **page; - struct mm_struct *mm; - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: %s pages %pK-%pK\n", alloc->pid, - allocate ? "allocate" : "free", start, end); - - if (end <= start) - return 0; - - trace_binder_update_page_range(alloc, allocate, start, end); - - if (vma) - mm = NULL; - else - mm = get_task_mm(alloc->tsk); - - if (mm) { - down_write(&mm->mmap_sem); - vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } - } - - if (allocate == 0) - goto free_range; - - if (vma == NULL) { - pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", - alloc->pid); - goto err_no_vma; - } - - for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { - int ret; - - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - - BUG_ON(*page); - *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); - if (*page == NULL) { - pr_err("%d: binder_alloc_buf failed for page at %pK\n", - alloc->pid, page_addr); - goto err_alloc_page_failed; - } - ret = map_kernel_range_noflush((unsigned long)page_addr, - PAGE_SIZE, PAGE_KERNEL, page); - flush_cache_vmap((unsigned long)page_addr, - (unsigned long)page_addr + PAGE_SIZE); - if (ret != 1) { - pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n", - alloc->pid, page_addr); - goto err_map_kernel_failed; - } - user_page_addr = - (uintptr_t)page_addr + alloc->user_buffer_offset; - ret = vm_insert_page(vma, user_page_addr, page[0]); - if (ret) { - pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", - alloc->pid, user_page_addr); - goto err_vm_insert_page_failed; - } - /* vm_insert_page does not seem to increment the refcount */ - } - if (mm) { - up_write(&mm->mmap_sem); - mmput(mm); - } - return 0; - -free_range: - for (page_addr = end - PAGE_SIZE; page_addr >= start; - page_addr -= PAGE_SIZE) { - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - if (vma) - zap_page_range(vma, (uintptr_t)page_addr + - alloc->user_buffer_offset, PAGE_SIZE, NULL); -err_vm_insert_page_failed: - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); -err_map_kernel_failed: - __free_page(*page); - *page = NULL; -err_alloc_page_failed: - ; - } -err_no_vma: - if (mm) { - up_write(&mm->mmap_sem); - mmput(mm); - } - return 
-ENOMEM; -} - -static struct binder_buffer *binder_alloc_new_buf_locked( - struct binder_alloc *alloc, size_t data_size, - size_t offsets_size, size_t extra_buffers_size, int is_async) -{ - struct rb_node *n = alloc->free_buffers.rb_node; - struct binder_buffer *buffer; - size_t buffer_size; - struct rb_node *best_fit = NULL; - void *has_page_addr; - void *end_page_addr; - size_t size, data_offsets_size; - - if (alloc->vma == NULL) { - pr_err("%d: binder_alloc_buf, no vma\n", - alloc->pid); - return NULL; - } - - data_offsets_size = ALIGN(data_size, sizeof(void *)) + - ALIGN(offsets_size, sizeof(void *)); - - if (data_offsets_size < data_size || data_offsets_size < offsets_size) { - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: got transaction with invalid size %zd-%zd\n", - alloc->pid, data_size, offsets_size); - return NULL; - } - size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); - if (size < data_offsets_size || size < extra_buffers_size) { - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: got transaction with invalid extra_buffers_size %zd\n", - alloc->pid, extra_buffers_size); - return NULL; - } - if (is_async && - alloc->free_async_space < size + sizeof(struct binder_buffer)) { - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd failed, no async space left\n", - alloc->pid, size); - return NULL; - } - - while (n) { - buffer = rb_entry(n, struct binder_buffer, rb_node); - BUG_ON(!buffer->free); - buffer_size = binder_alloc_buffer_size(alloc, buffer); - - if (size < buffer_size) { - best_fit = n; - n = n->rb_left; - } else if (size > buffer_size) - n = n->rb_right; - else { - best_fit = n; - break; - } - } - if (best_fit == NULL) { - pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", - alloc->pid, size); - return NULL; - } - if (n == NULL) { - buffer = rb_entry(best_fit, struct binder_buffer, rb_node); - buffer_size = binder_alloc_buffer_size(alloc, buffer); - } - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n", - alloc->pid, size, buffer, buffer_size); - - has_page_addr = - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); - if (n == NULL) { - if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) - buffer_size = size; /* no room for other buffers */ - else - buffer_size = size + sizeof(struct binder_buffer); - } - end_page_addr = - (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); - if (end_page_addr > has_page_addr) - end_page_addr = has_page_addr; - if (binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) - return NULL; - - rb_erase(best_fit, &alloc->free_buffers); - buffer->free = 0; - binder_insert_allocated_buffer(alloc, buffer); - if (buffer_size != size) { - struct binder_buffer *new_buffer = (void *)buffer->data + size; - - list_add(&new_buffer->entry, &buffer->entry); - new_buffer->free = 1; - binder_insert_free_buffer(alloc, new_buffer); - } - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got %pK\n", - alloc->pid, size, buffer); - buffer->data_size = data_size; - buffer->offsets_size = offsets_size; - buffer->async_transaction = is_async; - buffer->extra_buffers_size = extra_buffers_size; - if (is_async) { - alloc->free_async_space -= size + sizeof(struct binder_buffer); - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC, - "%d: binder_alloc_buf size %zd async free %zd\n", - alloc->pid, size, 
alloc->free_async_space); - } - - return buffer; -} - -struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, - size_t data_size, - size_t offsets_size, - size_t extra_buffers_size, - int is_async) -{ - struct binder_buffer *buffer; - - mutex_lock(&alloc->mutex); - buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size, - extra_buffers_size, is_async); - mutex_unlock(&alloc->mutex); - return buffer; -} - -static void *buffer_start_page(struct binder_buffer *buffer) -{ - return (void *)((uintptr_t)buffer & PAGE_MASK); -} - -static void *buffer_end_page(struct binder_buffer *buffer) -{ - return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); -} - -static void binder_delete_free_buffer(struct binder_alloc *alloc, - struct binder_buffer *buffer) -{ - struct binder_buffer *prev, *next = NULL; - int free_page_end = 1; - int free_page_start = 1; - - BUG_ON(alloc->buffers.next == &buffer->entry); - prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); - BUG_ON(!prev->free); - if (buffer_end_page(prev) == buffer_start_page(buffer)) { - free_page_start = 0; - if (buffer_end_page(prev) == buffer_end_page(buffer)) - free_page_end = 0; - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); - } - - if (!list_is_last(&buffer->entry, &alloc->buffers)) { - next = list_entry(buffer->entry.next, - struct binder_buffer, entry); - if (buffer_start_page(next) == buffer_end_page(buffer)) { - free_page_end = 0; - if (buffer_start_page(next) == - buffer_start_page(buffer)) - free_page_start = 0; - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); - } - } - list_del(&buffer->entry); - if (free_page_start || free_page_end) { - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", - alloc->pid, buffer, free_page_start ? "" : " end", - free_page_end ? "" : " start", prev, next); - binder_update_page_range(alloc, 0, free_page_start ? - buffer_start_page(buffer) : buffer_end_page(buffer), - (free_page_end ? 
buffer_end_page(buffer) : - buffer_start_page(buffer)) + PAGE_SIZE, NULL); - } -} - -static void binder_free_buf_locked(struct binder_alloc *alloc, - struct binder_buffer *buffer) -{ - size_t size, buffer_size; - - buffer_size = binder_alloc_buffer_size(alloc, buffer); - - size = ALIGN(buffer->data_size, sizeof(void *)) + - ALIGN(buffer->offsets_size, sizeof(void *)) + - ALIGN(buffer->extra_buffers_size, sizeof(void *)); - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%d: binder_free_buf %pK size %zd buffer_size %zd\n", - alloc->pid, buffer, size, buffer_size); - - BUG_ON(buffer->free); - BUG_ON(size > buffer_size); - BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < alloc->buffer); - BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); - - if (buffer->async_transaction) { - alloc->free_async_space += size + sizeof(struct binder_buffer); - - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC_ASYNC, - "%d: binder_free_buf size %zd async free %zd\n", - alloc->pid, size, alloc->free_async_space); - } - - binder_update_page_range(alloc, 0, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); - - rb_erase(&buffer->rb_node, &alloc->allocated_buffers); - buffer->free = 1; - if (!list_is_last(&buffer->entry, &alloc->buffers)) { - struct binder_buffer *next = list_entry(buffer->entry.next, - struct binder_buffer, entry); - - if (next->free) { - rb_erase(&next->rb_node, &alloc->free_buffers); - binder_delete_free_buffer(alloc, next); - } - } - if (alloc->buffers.next != &buffer->entry) { - struct binder_buffer *prev = list_entry(buffer->entry.prev, - struct binder_buffer, entry); - - if (prev->free) { - binder_delete_free_buffer(alloc, buffer); - rb_erase(&prev->rb_node, &alloc->free_buffers); - buffer = prev; - } - } - binder_insert_free_buffer(alloc, buffer); -} - -void binder_alloc_free_buf(struct binder_alloc *alloc, - struct binder_buffer *buffer) -{ - mutex_lock(&alloc->mutex); - binder_free_buf_locked(alloc, buffer); - mutex_unlock(&alloc->mutex); -} - static struct binder_node *binder_get_node(struct binder_proc *proc, binder_uintptr_t ptr) { @@ -3468,12 +2882,6 @@ static void binder_vma_open(struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot)); } -void binder_alloc_vma_close(struct binder_alloc *alloc) -{ - WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); -} - static void binder_vma_close(struct vm_area_struct *vma) { struct binder_proc *proc = vma->vm_private_data; @@ -3498,86 +2906,6 @@ static const struct vm_operations_struct binder_vm_ops = { .fault = binder_vm_fault, }; -int binder_alloc_mmap_handler(struct binder_alloc *alloc, - struct vm_area_struct *vma) -{ - int ret; - struct vm_struct *area; - const char *failure_string; - struct binder_buffer *buffer; - - mutex_lock(&binder_alloc_mmap_lock); - if (alloc->buffer) { - ret = -EBUSY; - failure_string = "already mapped"; - goto err_already_mapped; - } - - area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP); - if (area == NULL) { - ret = -ENOMEM; - failure_string = "get_vm_area"; - goto err_get_vm_area_failed; - } - alloc->buffer = area->addr; - alloc->user_buffer_offset = - vma->vm_start - (uintptr_t)alloc->buffer; - mutex_unlock(&binder_alloc_mmap_lock); - -#ifdef CONFIG_CPU_CACHE_VIPT - if (cache_is_vipt_aliasing()) { - while (CACHE_COLOUR( - (vma->vm_start ^ (uint32_t)alloc->buffer))) { - pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", - alloc->pid, vma->vm_start, 
vma->vm_end, - alloc->buffer); - vma->vm_start += PAGE_SIZE; - } - } -#endif - alloc->pages = kzalloc(sizeof(alloc->pages[0]) * - ((vma->vm_end - vma->vm_start) / PAGE_SIZE), - GFP_KERNEL); - if (alloc->pages == NULL) { - ret = -ENOMEM; - failure_string = "alloc page array"; - goto err_alloc_pages_failed; - } - alloc->buffer_size = vma->vm_end - vma->vm_start; - - if (binder_update_page_range(alloc, 1, alloc->buffer, - alloc->buffer + PAGE_SIZE, vma)) { - ret = -ENOMEM; - failure_string = "alloc small buf"; - goto err_alloc_small_buf_failed; - } - buffer = alloc->buffer; - INIT_LIST_HEAD(&alloc->buffers); - list_add(&buffer->entry, &alloc->buffers); - buffer->free = 1; - binder_insert_free_buffer(alloc, buffer); - alloc->free_async_space = alloc->buffer_size / 2; - barrier(); - alloc->vma = vma; - alloc->vma_vm_mm = vma->vm_mm; - - return 0; - -err_alloc_small_buf_failed: - kfree(alloc->pages); - alloc->pages = NULL; -err_alloc_pages_failed: - mutex_lock(&binder_alloc_mmap_lock); - vfree(alloc->buffer); - alloc->buffer = NULL; -err_get_vm_area_failed: -err_already_mapped: - mutex_unlock(&binder_alloc_mmap_lock); - pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, - alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); - return ret; -} - static int binder_mmap(struct file *filp, struct vm_area_struct *vma) { int ret; @@ -3617,13 +2945,6 @@ err_bad_arg: return ret; } -void binder_alloc_init(struct binder_alloc *alloc) -{ - alloc->tsk = current->group_leader; - alloc->pid = current->group_leader->pid; - mutex_init(&alloc->mutex); -} - static int binder_open(struct inode *nodp, struct file *filp) { struct binder_proc *proc; @@ -3759,55 +3080,6 @@ static int binder_node_release(struct binder_node *node, int refs) return refs; } -void binder_alloc_deferred_release(struct binder_alloc *alloc) -{ - struct rb_node *n; - int buffers, page_count; - - BUG_ON(alloc->vma); - - buffers = 0; - mutex_lock(&alloc->mutex); - while ((n = rb_first(&alloc->allocated_buffers))) { - struct binder_buffer *buffer; - - buffer = rb_entry(n, struct binder_buffer, rb_node); - - /* Transaction should already have been freed */ - BUG_ON(buffer->transaction); - - binder_free_buf_locked(alloc, buffer); - buffers++; - } - - page_count = 0; - if (alloc->pages) { - int i; - - for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { - void *page_addr; - - if (!alloc->pages[i]) - continue; - - page_addr = alloc->buffer + i * PAGE_SIZE; - binder_alloc_debug(BINDER_ALLOC_DEBUG_BUFFER_ALLOC, - "%s: %d: page %d at %pK not freed\n", - __func__, alloc->pid, i, page_addr); - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(alloc->pages[i]); - page_count++; - } - kfree(alloc->pages); - vfree(alloc->buffer); - } - mutex_unlock(&alloc->mutex); - - binder_alloc_debug(BINDER_ALLOC_DEBUG_OPEN_CLOSE, - "%s: %d buffers %d, pages %d\n", - __func__, alloc->pid, buffers, page_count); -} - static void binder_deferred_release(struct binder_proc *proc) { struct binder_context *context = proc->context; @@ -4052,27 +3324,6 @@ static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) ref->node->debug_id, ref->strong, ref->weak, ref->death); } -static void print_binder_buffer(struct seq_file *m, const char *prefix, - struct binder_buffer *buffer) -{ - seq_printf(m, "%s %d: %pK size %zd:%zd %s\n", - prefix, buffer->debug_id, buffer->data, - buffer->data_size, buffer->offsets_size, - buffer->transaction ? 
"active" : "delivered"); -} - -void binder_alloc_print_allocated(struct seq_file *m, - struct binder_alloc *alloc) -{ - struct rb_node *n; - - mutex_lock(&alloc->mutex); - for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) - print_binder_buffer(m, " buffer", - rb_entry(n, struct binder_buffer, rb_node)); - mutex_unlock(&alloc->mutex); -} - static void print_binder_proc(struct seq_file *m, struct binder_proc *proc, int print_all) { @@ -4199,18 +3450,6 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, } } -int binder_alloc_get_allocated_count(struct binder_alloc *alloc) -{ - struct rb_node *n; - int count = 0; - - mutex_lock(&alloc->mutex); - for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) - count++; - mutex_unlock(&alloc->mutex); - return count; -} - static void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c new file mode 100644 index 000000000000..eaef8f8fb859 --- /dev/null +++ b/drivers/android/binder_alloc.c @@ -0,0 +1,759 @@ +/* binder_alloc.c + * + * Android IPC Subsystem + * + * Copyright (C) 2007-2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "binder_alloc.h" +#include "binder_trace.h" + +static DEFINE_MUTEX(binder_alloc_mmap_lock); + +enum { + BINDER_DEBUG_OPEN_CLOSE = 1U << 1, + BINDER_DEBUG_BUFFER_ALLOC = 1U << 2, + BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, +}; +static uint32_t binder_alloc_debug_mask; + +module_param_named(debug_mask, binder_alloc_debug_mask, + uint, S_IWUSR | S_IRUGO); + +#define binder_alloc_debug(mask, x...) 
\ + do { \ + if (binder_alloc_debug_mask & mask) \ + pr_info(x); \ + } while (0) + +static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + if (list_is_last(&buffer->entry, &alloc->buffers)) + return alloc->buffer + + alloc->buffer_size - (void *)buffer->data; + return (size_t)list_entry(buffer->entry.next, + struct binder_buffer, entry) - (size_t)buffer->data; +} + +static void binder_insert_free_buffer(struct binder_alloc *alloc, + struct binder_buffer *new_buffer) +{ + struct rb_node **p = &alloc->free_buffers.rb_node; + struct rb_node *parent = NULL; + struct binder_buffer *buffer; + size_t buffer_size; + size_t new_buffer_size; + + BUG_ON(!new_buffer->free); + + new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer); + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: add free buffer, size %zd, at %pK\n", + alloc->pid, new_buffer_size, new_buffer); + + while (*p) { + parent = *p; + buffer = rb_entry(parent, struct binder_buffer, rb_node); + BUG_ON(!buffer->free); + + buffer_size = binder_alloc_buffer_size(alloc, buffer); + + if (new_buffer_size < buffer_size) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&new_buffer->rb_node, parent, p); + rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers); +} + +static void binder_insert_allocated_buffer_locked( + struct binder_alloc *alloc, struct binder_buffer *new_buffer) +{ + struct rb_node **p = &alloc->allocated_buffers.rb_node; + struct rb_node *parent = NULL; + struct binder_buffer *buffer; + + BUG_ON(new_buffer->free); + + while (*p) { + parent = *p; + buffer = rb_entry(parent, struct binder_buffer, rb_node); + BUG_ON(buffer->free); + + if (new_buffer < buffer) + p = &parent->rb_left; + else if (new_buffer > buffer) + p = &parent->rb_right; + else + BUG(); + } + rb_link_node(&new_buffer->rb_node, parent, p); + rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); +} + +static struct binder_buffer *binder_alloc_buffer_lookup_locked( + struct binder_alloc *alloc, + uintptr_t user_ptr) +{ + struct rb_node *n = alloc->allocated_buffers.rb_node; + struct binder_buffer *buffer; + struct binder_buffer *kern_ptr; + + kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset + - offsetof(struct binder_buffer, data)); + + while (n) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + BUG_ON(buffer->free); + + if (kern_ptr < buffer) + n = n->rb_left; + else if (kern_ptr > buffer) + n = n->rb_right; + else + return buffer; + } + return NULL; +} + +/** + * binder_alloc_buffer_lookup() - get buffer given user ptr + * @alloc: binder_alloc for this proc + * @user_ptr: User pointer to buffer data + * + * Validate userspace pointer to buffer data and return buffer corresponding to + * that user pointer. Search the rb tree for buffer that matches user data + * pointer. + * + * Return: Pointer to buffer or NULL + */ +struct binder_buffer *binder_alloc_buffer_lookup(struct binder_alloc *alloc, + uintptr_t user_ptr) +{ + struct binder_buffer *buffer; + + mutex_lock(&alloc->mutex); + buffer = binder_alloc_buffer_lookup_locked(alloc, user_ptr); + mutex_unlock(&alloc->mutex); + return buffer; +} + +static int binder_update_page_range(struct binder_alloc *alloc, int allocate, + void *start, void *end, + struct vm_area_struct *vma) +{ + void *page_addr; + unsigned long user_page_addr; + struct page **page; + struct mm_struct *mm; + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: %s pages %pK-%pK\n", alloc->pid, + allocate ? 
"allocate" : "free", start, end); + + if (end <= start) + return 0; + + trace_binder_update_page_range(alloc, allocate, start, end); + + if (vma) + mm = NULL; + else + mm = get_task_mm(alloc->tsk); + + if (mm) { + down_write(&mm->mmap_sem); + vma = alloc->vma; + if (vma && mm != alloc->vma_vm_mm) { + pr_err("%d: vma mm and task mm mismatch\n", + alloc->pid); + vma = NULL; + } + } + + if (allocate == 0) + goto free_range; + + if (vma == NULL) { + pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", + alloc->pid); + goto err_no_vma; + } + + for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { + int ret; + + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + + BUG_ON(*page); + *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + if (*page == NULL) { + pr_err("%d: binder_alloc_buf failed for page at %pK\n", + alloc->pid, page_addr); + goto err_alloc_page_failed; + } + ret = map_kernel_range_noflush((unsigned long)page_addr, + PAGE_SIZE, PAGE_KERNEL, page); + flush_cache_vmap((unsigned long)page_addr, + (unsigned long)page_addr + PAGE_SIZE); + if (ret != 1) { + pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n", + alloc->pid, page_addr); + goto err_map_kernel_failed; + } + user_page_addr = + (uintptr_t)page_addr + alloc->user_buffer_offset; + ret = vm_insert_page(vma, user_page_addr, page[0]); + if (ret) { + pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", + alloc->pid, user_page_addr); + goto err_vm_insert_page_failed; + } + /* vm_insert_page does not seem to increment the refcount */ + } + if (mm) { + up_write(&mm->mmap_sem); + mmput(mm); + } + return 0; + +free_range: + for (page_addr = end - PAGE_SIZE; page_addr >= start; + page_addr -= PAGE_SIZE) { + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + if (vma) + zap_page_range(vma, (uintptr_t)page_addr + + alloc->user_buffer_offset, PAGE_SIZE, NULL); +err_vm_insert_page_failed: + unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); +err_map_kernel_failed: + __free_page(*page); + *page = NULL; +err_alloc_page_failed: + ; + } +err_no_vma: + if (mm) { + up_write(&mm->mmap_sem); + mmput(mm); + } + return -ENOMEM; +} + +struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async) +{ + struct rb_node *n = alloc->free_buffers.rb_node; + struct binder_buffer *buffer; + size_t buffer_size; + struct rb_node *best_fit = NULL; + void *has_page_addr; + void *end_page_addr; + size_t size, data_offsets_size; + + if (alloc->vma == NULL) { + pr_err("%d: binder_alloc_buf, no vma\n", + alloc->pid); + return NULL; + } + + data_offsets_size = ALIGN(data_size, sizeof(void *)) + + ALIGN(offsets_size, sizeof(void *)); + + if (data_offsets_size < data_size || data_offsets_size < offsets_size) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: got transaction with invalid size %zd-%zd\n", + alloc->pid, data_size, offsets_size); + return NULL; + } + size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); + if (size < data_offsets_size || size < extra_buffers_size) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: got transaction with invalid extra_buffers_size %zd\n", + alloc->pid, extra_buffers_size); + return NULL; + } + if (is_async && + alloc->free_async_space < size + sizeof(struct binder_buffer)) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd failed, no async space left\n", + 
alloc->pid, size); + return NULL; + } + + while (n) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + BUG_ON(!buffer->free); + buffer_size = binder_alloc_buffer_size(alloc, buffer); + + if (size < buffer_size) { + best_fit = n; + n = n->rb_left; + } else if (size > buffer_size) + n = n->rb_right; + else { + best_fit = n; + break; + } + } + if (best_fit == NULL) { + pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", + alloc->pid, size); + return NULL; + } + if (n == NULL) { + buffer = rb_entry(best_fit, struct binder_buffer, rb_node); + buffer_size = binder_alloc_buffer_size(alloc, buffer); + } + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n", + alloc->pid, size, buffer, buffer_size); + + has_page_addr = + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); + if (n == NULL) { + if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) + buffer_size = size; /* no room for other buffers */ + else + buffer_size = size + sizeof(struct binder_buffer); + } + end_page_addr = + (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); + if (end_page_addr > has_page_addr) + end_page_addr = has_page_addr; + if (binder_update_page_range(alloc, 1, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) + return NULL; + + rb_erase(best_fit, &alloc->free_buffers); + buffer->free = 0; + binder_insert_allocated_buffer_locked(alloc, buffer); + if (buffer_size != size) { + struct binder_buffer *new_buffer = (void *)buffer->data + size; + + list_add(&new_buffer->entry, &buffer->entry); + new_buffer->free = 1; + binder_insert_free_buffer(alloc, new_buffer); + } + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got %pK\n", + alloc->pid, size, buffer); + buffer->data_size = data_size; + buffer->offsets_size = offsets_size; + buffer->async_transaction = is_async; + buffer->extra_buffers_size = extra_buffers_size; + if (is_async) { + alloc->free_async_space -= size + sizeof(struct binder_buffer); + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + "%d: binder_alloc_buf size %zd async free %zd\n", + alloc->pid, size, alloc->free_async_space); + } + return buffer; +} + +/** + * binder_alloc_new_buf() - Allocate a new binder buffer + * @alloc: binder_alloc for this proc + * @data_size: size of user data buffer + * @offsets_size: user specified buffer offset + * @extra_buffers_size: size of extra space for meta-data (eg, security context) + * @is_async: buffer for async transaction + * + * Allocate a new buffer given the requested sizes. Returns + * the kernel version of the buffer pointer. 
The size allocated + * is the sum of the three given sizes (each rounded up to + * pointer-sized boundary) + * + * Return: The allocated buffer or %NULL if error + */ +struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async) +{ + struct binder_buffer *buffer; + + mutex_lock(&alloc->mutex); + buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size, + extra_buffers_size, is_async); + mutex_unlock(&alloc->mutex); + return buffer; +} + +static void *buffer_start_page(struct binder_buffer *buffer) +{ + return (void *)((uintptr_t)buffer & PAGE_MASK); +} + +static void *buffer_end_page(struct binder_buffer *buffer) +{ + return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); +} + +static void binder_delete_free_buffer(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + struct binder_buffer *prev, *next = NULL; + int free_page_end = 1; + int free_page_start = 1; + + BUG_ON(alloc->buffers.next == &buffer->entry); + prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); + BUG_ON(!prev->free); + if (buffer_end_page(prev) == buffer_start_page(buffer)) { + free_page_start = 0; + if (buffer_end_page(prev) == buffer_end_page(buffer)) + free_page_end = 0; + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer, prev); + } + + if (!list_is_last(&buffer->entry, &alloc->buffers)) { + next = list_entry(buffer->entry.next, + struct binder_buffer, entry); + if (buffer_start_page(next) == buffer_end_page(buffer)) { + free_page_end = 0; + if (buffer_start_page(next) == + buffer_start_page(buffer)) + free_page_start = 0; + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer, prev); + } + } + list_del(&buffer->entry); + if (free_page_start || free_page_end) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", + alloc->pid, buffer, free_page_start ? "" : " end", + free_page_end ? "" : " start", prev, next); + binder_update_page_range(alloc, 0, free_page_start ? + buffer_start_page(buffer) : buffer_end_page(buffer), + (free_page_end ? 
buffer_end_page(buffer) : + buffer_start_page(buffer)) + PAGE_SIZE, NULL); + } +} + +static void binder_free_buf_locked(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + size_t size, buffer_size; + + buffer_size = binder_alloc_buffer_size(alloc, buffer); + + size = ALIGN(buffer->data_size, sizeof(void *)) + + ALIGN(buffer->offsets_size, sizeof(void *)) + + ALIGN(buffer->extra_buffers_size, sizeof(void *)); + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_free_buf %pK size %zd buffer_size %zd\n", + alloc->pid, buffer, size, buffer_size); + + BUG_ON(buffer->free); + BUG_ON(size > buffer_size); + BUG_ON(buffer->transaction != NULL); + BUG_ON((void *)buffer < alloc->buffer); + BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); + + if (buffer->async_transaction) { + alloc->free_async_space += size + sizeof(struct binder_buffer); + + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + "%d: binder_free_buf size %zd async free %zd\n", + alloc->pid, size, alloc->free_async_space); + } + + binder_update_page_range(alloc, 0, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), + NULL); + + rb_erase(&buffer->rb_node, &alloc->allocated_buffers); + buffer->free = 1; + if (!list_is_last(&buffer->entry, &alloc->buffers)) { + struct binder_buffer *next = list_entry(buffer->entry.next, + struct binder_buffer, entry); + + if (next->free) { + rb_erase(&next->rb_node, &alloc->free_buffers); + binder_delete_free_buffer(alloc, next); + } + } + if (alloc->buffers.next != &buffer->entry) { + struct binder_buffer *prev = list_entry(buffer->entry.prev, + struct binder_buffer, entry); + + if (prev->free) { + binder_delete_free_buffer(alloc, buffer); + rb_erase(&prev->rb_node, &alloc->free_buffers); + buffer = prev; + } + } + binder_insert_free_buffer(alloc, buffer); +} + +/** + * binder_alloc_free_buf() - free a binder buffer + * @alloc: binder_alloc for this proc + * @buffer: kernel pointer to buffer + * + * Free the buffer allocated via binder_alloc_new_buffer() + */ +void binder_alloc_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffer) +{ + mutex_lock(&alloc->mutex); + binder_free_buf_locked(alloc, buffer); + mutex_unlock(&alloc->mutex); +} + +/** + * binder_alloc_mmap_handler() - map virtual address space for proc + * @alloc: alloc structure for this proc + * @vma: vma passed to mmap() + * + * Called by binder_mmap() to initialize the space specified in + * vma for allocating binder buffers + * + * Return: + * 0 = success + * -EBUSY = address space already mapped + * -ENOMEM = failed to map memory to given address space + */ +int binder_alloc_mmap_handler(struct binder_alloc *alloc, + struct vm_area_struct *vma) +{ + int ret; + struct vm_struct *area; + const char *failure_string; + struct binder_buffer *buffer; + + mutex_lock(&binder_alloc_mmap_lock); + if (alloc->buffer) { + ret = -EBUSY; + failure_string = "already mapped"; + goto err_already_mapped; + } + + area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP); + if (area == NULL) { + ret = -ENOMEM; + failure_string = "get_vm_area"; + goto err_get_vm_area_failed; + } + alloc->buffer = area->addr; + alloc->user_buffer_offset = + vma->vm_start - (uintptr_t)alloc->buffer; + mutex_unlock(&binder_alloc_mmap_lock); + +#ifdef CONFIG_CPU_CACHE_VIPT + if (cache_is_vipt_aliasing()) { + while (CACHE_COLOUR( + (vma->vm_start ^ (uint32_t)alloc->buffer))) { + pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", + alloc->pid, vma->vm_start, 
vma->vm_end, + alloc->buffer); + vma->vm_start += PAGE_SIZE; + } + } +#endif + alloc->pages = kzalloc(sizeof(alloc->pages[0]) * + ((vma->vm_end - vma->vm_start) / PAGE_SIZE), + GFP_KERNEL); + if (alloc->pages == NULL) { + ret = -ENOMEM; + failure_string = "alloc page array"; + goto err_alloc_pages_failed; + } + alloc->buffer_size = vma->vm_end - vma->vm_start; + + if (binder_update_page_range(alloc, 1, alloc->buffer, + alloc->buffer + PAGE_SIZE, vma)) { + ret = -ENOMEM; + failure_string = "alloc small buf"; + goto err_alloc_small_buf_failed; + } + buffer = alloc->buffer; + INIT_LIST_HEAD(&alloc->buffers); + list_add(&buffer->entry, &alloc->buffers); + buffer->free = 1; + binder_insert_free_buffer(alloc, buffer); + alloc->free_async_space = alloc->buffer_size / 2; + barrier(); + alloc->vma = vma; + alloc->vma_vm_mm = vma->vm_mm; + + return 0; + +err_alloc_small_buf_failed: + kfree(alloc->pages); + alloc->pages = NULL; +err_alloc_pages_failed: + mutex_lock(&binder_alloc_mmap_lock); + vfree(alloc->buffer); + alloc->buffer = NULL; +err_get_vm_area_failed: +err_already_mapped: + mutex_unlock(&binder_alloc_mmap_lock); + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, + alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); + return ret; +} + + +void binder_alloc_deferred_release(struct binder_alloc *alloc) +{ + struct rb_node *n; + int buffers, page_count; + + BUG_ON(alloc->vma); + + buffers = 0; + mutex_lock(&alloc->mutex); + while ((n = rb_first(&alloc->allocated_buffers))) { + struct binder_buffer *buffer; + + buffer = rb_entry(n, struct binder_buffer, rb_node); + + /* Transaction should already have been freed */ + BUG_ON(buffer->transaction); + + binder_free_buf_locked(alloc, buffer); + buffers++; + } + + page_count = 0; + if (alloc->pages) { + int i; + + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + void *page_addr; + + if (!alloc->pages[i]) + continue; + + page_addr = alloc->buffer + i * PAGE_SIZE; + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%s: %d: page %d at %pK not freed\n", + __func__, alloc->pid, i, page_addr); + unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); + __free_page(alloc->pages[i]); + page_count++; + } + kfree(alloc->pages); + vfree(alloc->buffer); + } + mutex_unlock(&alloc->mutex); + + binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, + "%s: %d buffers %d, pages %d\n", + __func__, alloc->pid, buffers, page_count); +} + +static void print_binder_buffer(struct seq_file *m, const char *prefix, + struct binder_buffer *buffer) +{ + seq_printf(m, "%s %d: %pK size %zd:%zd %s\n", + prefix, buffer->debug_id, buffer->data, + buffer->data_size, buffer->offsets_size, + buffer->transaction ? 
"active" : "delivered"); +} + +/** + * binder_alloc_print_allocated() - print buffer info + * @m: seq_file for output via seq_printf() + * @alloc: binder_alloc for this proc + * + * Prints information about every buffer associated with + * the binder_alloc state to the given seq_file + */ +void binder_alloc_print_allocated(struct seq_file *m, + struct binder_alloc *alloc) +{ + struct rb_node *n; + + mutex_lock(&alloc->mutex); + for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) + print_binder_buffer(m, " buffer", + rb_entry(n, struct binder_buffer, rb_node)); + mutex_unlock(&alloc->mutex); +} + +/** + * binder_alloc_get_allocated_count() - return count of buffers + * @alloc: binder_alloc for this proc + * + * Return: count of allocated buffers + */ +int binder_alloc_get_allocated_count(struct binder_alloc *alloc) +{ + struct rb_node *n; + int count = 0; + + mutex_lock(&alloc->mutex); + for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) + count++; + mutex_unlock(&alloc->mutex); + return count; +} + + +/** + * binder_alloc_vma_close() - invalidate address space + * @alloc: binder_alloc for this proc + * + * Called from binder_vma_close() when releasing address space. + * Clears alloc->vma to prevent new incoming transactions from + * allocating more buffers. + */ +void binder_alloc_vma_close(struct binder_alloc *alloc) +{ + WRITE_ONCE(alloc->vma, NULL); + WRITE_ONCE(alloc->vma_vm_mm, NULL); +} + +/** + * binder_alloc_init() - called by binder_open() for per-proc initialization + * @alloc: binder_alloc for this proc + * + * Called from binder_open() to initialize binder_alloc fields for + * new binder proc + */ +void binder_alloc_init(struct binder_alloc *alloc) +{ + alloc->tsk = current->group_leader; + alloc->pid = current->group_leader->pid; + mutex_init(&alloc->mutex); +} + diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h new file mode 100644 index 000000000000..721c511431f9 --- /dev/null +++ b/drivers/android/binder_alloc.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ */
+
+#ifndef _LINUX_BINDER_ALLOC_H
+#define _LINUX_BINDER_ALLOC_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct binder_transaction;
+
+/**
+ * struct binder_buffer - buffer used for binder transactions
+ * @entry:              entry in alloc->buffers
+ * @rb_node:            node for allocated_buffers/free_buffers rb trees
+ * @free:               true if buffer is free
+ * @allow_user_free:    true if userspace may free the buffer via
+ *                      BC_FREE_BUFFER
+ * @async_transaction:  true if buffer is in use for an async transaction
+ * @debug_id:           unique ID for debugging
+ * @transaction:        pointer to the associated struct binder_transaction
+ * @target_node:        struct binder_node associated with this buffer
+ * @data_size:          size of @data area
+ * @offsets_size:       size of the array of offsets into @data
+ * @extra_buffers_size: size of space for other objects (such as sg lists)
+ * @data:               buffer data (flexible array at end of struct)
+ *
+ * Bookkeeping structure for binder transaction buffers
+ */
+struct binder_buffer {
+	struct list_head entry; /* free and allocated entries by address */
+	struct rb_node rb_node; /* free entry by size or allocated entry */
+				/* by address */
+	unsigned free:1;
+	unsigned allow_user_free:1;
+	unsigned async_transaction:1;
+	unsigned debug_id:29;
+
+	struct binder_transaction *transaction;
+
+	struct binder_node *target_node;
+	size_t data_size;
+	size_t offsets_size;
+	size_t extra_buffers_size;
+	uint8_t data[0];
+};
+
+/**
+ * struct binder_alloc - per-binder proc state for binder allocator
+ * @vma:                vm_area_struct passed to mmap_handler
+ *                      (invariant after mmap)
+ * @tsk:                task_struct for the task that called init for this
+ *                      proc (invariant after init)
+ * @vma_vm_mm:          copy of vma->vm_mm (invariant after mmap)
+ * @buffer:             base of per-proc address space mapped via mmap
+ * @user_buffer_offset: offset between user and kernel VAs for buffer
+ * @buffers:            list of all buffers for this proc
+ * @free_buffers:       rb tree of buffers available for allocation
+ *                      sorted by size
+ * @allocated_buffers:  rb tree of allocated buffers sorted by address
+ * @free_async_space:   VA space available for async buffers. This is
+ *                      initialized at mmap time to 1/2 the full VA space
+ * @pages:              array of physical page addresses for each
+ *                      page of mmap'd space
+ * @buffer_size:        size of address space specified via mmap
+ * @pid:                pid for associated binder_proc (invariant after init)
+ *
+ * Bookkeeping structure for per-proc address space management for binder
+ * buffers. It is normally initialized during binder_init() and binder_mmap()
+ * calls.
+ * The address space is used for both user-visible buffers and for
+ * struct binder_buffer objects used to track the user buffers
+ */
+struct binder_alloc {
+	struct mutex mutex;
+	struct task_struct *tsk;
+	struct vm_area_struct *vma;
+	struct mm_struct *vma_vm_mm;
+	void *buffer;
+	ptrdiff_t user_buffer_offset;
+	struct list_head buffers;
+	struct rb_root free_buffers;
+	struct rb_root allocated_buffers;
+	size_t free_async_space;
+	struct page **pages;
+	size_t buffer_size;
+	uint32_t buffer_free;
+	int pid;
+};
+
+extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
+						  size_t data_size,
+						  size_t offsets_size,
+						  size_t extra_buffers_size,
+						  int is_async);
+extern void binder_alloc_init(struct binder_alloc *alloc);
+extern void binder_alloc_vma_close(struct binder_alloc *alloc);
+extern struct binder_buffer *
+binder_alloc_buffer_lookup(struct binder_alloc *alloc,
+			   uintptr_t user_ptr);
+extern void binder_alloc_free_buf(struct binder_alloc *alloc,
+				  struct binder_buffer *buffer);
+extern int binder_alloc_mmap_handler(struct binder_alloc *alloc,
+				     struct vm_area_struct *vma);
+extern void binder_alloc_deferred_release(struct binder_alloc *alloc);
+extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc);
+extern void binder_alloc_print_allocated(struct seq_file *m,
+					 struct binder_alloc *alloc);
+
+/**
+ * binder_alloc_get_free_async_space() - get free space available for async
+ * @alloc:	binder_alloc for this proc
+ *
+ * Return: the bytes remaining in the address space for async transactions
+ */
+static inline size_t
+binder_alloc_get_free_async_space(struct binder_alloc *alloc)
+{
+	size_t free_async_space;
+
+	mutex_lock(&alloc->mutex);
+	free_async_space = alloc->free_async_space;
+	mutex_unlock(&alloc->mutex);
+	return free_async_space;
+}
+
+/**
+ * binder_alloc_get_user_buffer_offset() - get offset between kernel/user addrs
+ * @alloc:	binder_alloc for this proc
+ *
+ * Return: the offset between kernel and user-space addresses to use for
+ * virtual address conversion
+ */
+static inline ptrdiff_t
+binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc)
+{
+	/*
+	 * user_buffer_offset is constant if vma is set and
+	 * undefined if vma is not set. It is possible to
+	 * get here with !alloc->vma if the target process
+	 * is dying while a transaction is being initiated.
+	 * Returning the old value is ok in this case and
+	 * the transaction will fail.
+	 */
+	return alloc->user_buffer_offset;
+}
+
+#endif /* _LINUX_BINDER_ALLOC_H */

From a19f3efc4f54593d9e17f3fe7afc1ea78e0ae5f0 Mon Sep 17 00:00:00 2001
From: Todd Kjos
Date: Wed, 24 May 2017 11:53:13 -0700
Subject: [PATCH 0905/3220] FROMLIST: binder: remove binder_debug_no_lock
 mechanism

(from https://patchwork.kernel.org/patch/9817811/)

With the global lock, there was a mechanism to access binder driver
debugging information with the global lock disabled to debug deadlocks
or other issues. This mechanism is rarely (if ever) used anymore and
wasn't needed during the development of fine-grained locking in the
binder driver. Removing it.
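For context, every debugfs show function currently open-codes the
pattern below (a sketch reconstructed from the diff that follows, using
binder_state_show() as the example); the patch replaces each instance
with an unconditional binder_lock()/binder_unlock() pair:

	static int binder_state_show(struct seq_file *m, void *unused)
	{
		struct binder_proc *proc;
		struct binder_node *node;
		/* skip locking when the proc_no_lock module param is set */
		int do_lock = !binder_debug_no_lock;

		if (do_lock)
			binder_lock(__func__);

		/* ... print dead nodes and per-proc state ... */

		if (do_lock)
			binder_unlock(__func__);
		return 0;
	}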
Change-Id: Ie1cf45748cefa433607a97c2ef322e7906efe0f7 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3251b7cdd717..9fdf1ed9d7f6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -107,9 +107,6 @@ static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); -static bool binder_debug_no_lock; -module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO); - static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; module_param_named(devices, binder_devices_param, charp, S_IRUGO); @@ -3508,10 +3505,8 @@ static int binder_state_show(struct seq_file *m, void *unused) { struct binder_proc *proc; struct binder_node *node; - int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + binder_lock(__func__); seq_puts(m, "binder state:\n"); @@ -3522,18 +3517,15 @@ static int binder_state_show(struct seq_file *m, void *unused) hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 1); - if (do_lock) - binder_unlock(__func__); + binder_unlock(__func__); return 0; } static int binder_stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; - int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + binder_lock(__func__); seq_puts(m, "binder stats:\n"); @@ -3541,24 +3533,20 @@ static int binder_stats_show(struct seq_file *m, void *unused) hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); - if (do_lock) - binder_unlock(__func__); + binder_unlock(__func__); return 0; } static int binder_transactions_show(struct seq_file *m, void *unused) { struct binder_proc *proc; - int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + binder_lock(__func__); seq_puts(m, "binder transactions:\n"); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 0); - if (do_lock) - binder_unlock(__func__); + binder_unlock(__func__); return 0; } @@ -3566,10 +3554,8 @@ static int binder_proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; int pid = (unsigned long)m->private; - int do_lock = !binder_debug_no_lock; - if (do_lock) - binder_lock(__func__); + binder_lock(__func__); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { @@ -3577,8 +3563,7 @@ static int binder_proc_show(struct seq_file *m, void *unused) print_binder_proc(m, itr, 1); } } - if (do_lock) - binder_unlock(__func__); + binder_unlock(__func__); return 0; } From 3490fdcb77030b103e6a709a63191e09c6e64ac0 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 17 Oct 2016 12:33:15 -0700 Subject: [PATCH 0906/3220] FROMLIST: binder: add protection for non-perf cases (from https://patchwork.kernel.org/patch/9817749/) Add binder_dead_nodes_lock, binder_procs_lock, and binder_context_mgr_node_lock to protect the associated global lists Bug: 33250092 32225111 Change-Id: I9d1158536783763e0fa93b18e19fba2c488d8cfc Signed-off-by: Todd Kjos --- drivers/android/binder.c | 81 +++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9fdf1ed9d7f6..4b9816409b54 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ 
-45,12 +45,16 @@ #include "binder_trace.h" static DEFINE_MUTEX(binder_main_lock); + +static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); -static HLIST_HEAD(binder_deferred_list); +static DEFINE_MUTEX(binder_procs_lock); + static HLIST_HEAD(binder_dead_nodes); +static DEFINE_SPINLOCK(binder_dead_nodes_lock); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; @@ -219,6 +223,8 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_context { struct binder_node *binder_context_mgr_node; + struct mutex context_mgr_node_lock; + kuid_t binder_context_mgr_uid; const char *name; }; @@ -571,7 +577,9 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) "refless node %d deleted\n", node->debug_id); } else { + spin_lock(&binder_dead_nodes_lock); hlist_del(&node->dead_node); + spin_unlock(&binder_dead_nodes_lock); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "dead node %d deleted\n", node->debug_id); @@ -1455,11 +1463,14 @@ static void binder_transaction(struct binder_proc *proc, } target_node = ref->node; } else { + mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; if (target_node == NULL) { return_error = BR_DEAD_REPLY; + mutex_unlock(&context->context_mgr_node_lock); goto err_no_context_mgr_node; } + mutex_unlock(&context->context_mgr_node_lock); } e->to_node = target_node->debug_id; target_proc = target_node->proc; @@ -1825,22 +1836,31 @@ static int binder_thread_write(struct binder_proc *proc, case BC_RELEASE: case BC_DECREFS: { uint32_t target; - struct binder_ref *ref; + struct binder_ref *ref = NULL; const char *debug_string; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; + ptr += sizeof(uint32_t); - if (target == 0 && context->binder_context_mgr_node && + if (target == 0 && (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { - ref = binder_get_ref_for_node(proc, - context->binder_context_mgr_node); - if (ref->desc != target) { - binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", - proc->pid, thread->pid, - ref->desc); + struct binder_node *ctx_mgr_node; + + mutex_lock(&context->context_mgr_node_lock); + ctx_mgr_node = context->binder_context_mgr_node; + if (ctx_mgr_node) { + ref = binder_get_ref_for_node(proc, + ctx_mgr_node); + if (ref && ref->desc != target) { + binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", + proc->pid, thread->pid, + ref->desc); + } } - } else + mutex_unlock(&context->context_mgr_node_lock); + } + if (ref == NULL) ref = binder_get_ref(proc, target, cmd == BC_ACQUIRE || cmd == BC_RELEASE); @@ -2754,9 +2774,10 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) int ret = 0; struct binder_proc *proc = filp->private_data; struct binder_context *context = proc->context; - + struct binder_node *new_node; kuid_t curr_euid = current_euid(); + mutex_lock(&context->context_mgr_node_lock); if (context->binder_context_mgr_node) { pr_err("BINDER_SET_CONTEXT_MGR already set\n"); ret = -EBUSY; @@ -2777,16 +2798,18 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) } else { context->binder_context_mgr_uid = curr_euid; } - context->binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (!context->binder_context_mgr_node) { + new_node = binder_new_node(proc, 0, 0); + if (!new_node) { ret = -ENOMEM; goto out; } - 
context->binder_context_mgr_node->local_weak_refs++; - context->binder_context_mgr_node->local_strong_refs++; - context->binder_context_mgr_node->has_strong_ref = 1; - context->binder_context_mgr_node->has_weak_ref = 1; + new_node->local_weak_refs++; + new_node->local_strong_refs++; + new_node->has_strong_ref = 1; + new_node->has_weak_ref = 1; + context->binder_context_mgr_node = new_node; out: + mutex_unlock(&context->context_mgr_node_lock); return ret; } @@ -2966,13 +2989,16 @@ static int binder_open(struct inode *nodp, struct file *filp) binder_lock(__func__); binder_stats_created(BINDER_STAT_PROC); - hlist_add_head(&proc->proc_node, &binder_procs); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); filp->private_data = proc; binder_unlock(__func__); + mutex_lock(&binder_procs_lock); + hlist_add_head(&proc->proc_node, &binder_procs); + mutex_unlock(&binder_procs_lock); + if (binder_debugfs_dir_entry_proc) { char strbuf[11]; @@ -3051,7 +3077,10 @@ static int binder_node_release(struct binder_node *node, int refs) node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; + + spin_lock(&binder_dead_nodes_lock); hlist_add_head(&node->dead_node, &binder_dead_nodes); + spin_unlock(&binder_dead_nodes_lock); hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; @@ -3085,8 +3114,11 @@ static void binder_deferred_release(struct binder_proc *proc) BUG_ON(proc->files); + mutex_lock(&binder_procs_lock); hlist_del(&proc->proc_node); + mutex_unlock(&binder_procs_lock); + mutex_lock(&context->context_mgr_node_lock); if (context->binder_context_mgr_node && context->binder_context_mgr_node->proc == proc) { binder_debug(BINDER_DEBUG_DEAD_BINDER, @@ -3094,6 +3126,7 @@ static void binder_deferred_release(struct binder_proc *proc) __func__, proc->pid); context->binder_context_mgr_node = NULL; } + mutex_unlock(&context->context_mgr_node_lock); threads = 0; active_transactions = 0; @@ -3510,13 +3543,17 @@ static int binder_state_show(struct seq_file *m, void *unused) seq_puts(m, "binder state:\n"); + spin_lock(&binder_dead_nodes_lock); if (!hlist_empty(&binder_dead_nodes)) seq_puts(m, "dead nodes:\n"); hlist_for_each_entry(node, &binder_dead_nodes, dead_node) print_binder_node(m, node); + spin_unlock(&binder_dead_nodes_lock); + mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 1); + mutex_unlock(&binder_procs_lock); binder_unlock(__func__); return 0; } @@ -3531,8 +3568,10 @@ static int binder_stats_show(struct seq_file *m, void *unused) print_binder_stats(m, "", &binder_stats); + mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); + mutex_unlock(&binder_procs_lock); binder_unlock(__func__); return 0; } @@ -3544,8 +3583,10 @@ static int binder_transactions_show(struct seq_file *m, void *unused) binder_lock(__func__); seq_puts(m, "binder transactions:\n"); + mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 0); + mutex_unlock(&binder_procs_lock); binder_unlock(__func__); return 0; } @@ -3557,12 +3598,15 @@ static int binder_proc_show(struct seq_file *m, void *unused) binder_lock(__func__); + mutex_lock(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { seq_puts(m, "binder proc state:\n"); print_binder_proc(m, itr, 1); } } + mutex_unlock(&binder_procs_lock); + binder_unlock(__func__); return 0; } @@ -3623,6 +3667,7 @@ static 
int __init init_binder_device(const char *name) binder_device->context.binder_context_mgr_uid = INVALID_UID; binder_device->context.name = name; + mutex_init(&binder_device->context.context_mgr_node_lock); ret = misc_register(&binder_device->miscdev); if (ret < 0) { From f716ecfc028f7733e8549cb2cb1f2851bc471253 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Thu, 13 Oct 2016 16:36:15 -0700 Subject: [PATCH 0907/3220] FROMLIST: binder: change binder_stats to atomics (from https://patchwork.kernel.org/patch/9817755/) Use atomics for stats to avoid needing to lock for increments/decrements Bug: 33250092 32225111 Change-Id: I13e69b7f0485ccf16673e25091455781e1933a98 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 48 +++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4b9816409b54..0b6ff5f84322 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -167,22 +167,22 @@ enum binder_stat_types { }; struct binder_stats { - int br[_IOC_NR(BR_FAILED_REPLY) + 1]; - int bc[_IOC_NR(BC_REPLY_SG) + 1]; - int obj_created[BINDER_STAT_COUNT]; - int obj_deleted[BINDER_STAT_COUNT]; + atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1]; + atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1]; + atomic_t obj_created[BINDER_STAT_COUNT]; + atomic_t obj_deleted[BINDER_STAT_COUNT]; }; static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { - binder_stats.obj_deleted[type]++; + atomic_inc(&binder_stats.obj_deleted[type]); } static inline void binder_stats_created(enum binder_stat_types type) { - binder_stats.obj_created[type]++; + atomic_inc(&binder_stats.obj_created[type]); } struct binder_transaction_log_entry { @@ -1826,9 +1826,9 @@ static int binder_thread_write(struct binder_proc *proc, ptr += sizeof(uint32_t); trace_binder_command(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { - binder_stats.bc[_IOC_NR(cmd)]++; - proc->stats.bc[_IOC_NR(cmd)]++; - thread->stats.bc[_IOC_NR(cmd)]++; + atomic_inc(&binder_stats.bc[_IOC_NR(cmd)]); + atomic_inc(&proc->stats.bc[_IOC_NR(cmd)]); + atomic_inc(&thread->stats.bc[_IOC_NR(cmd)]); } switch (cmd) { case BC_INCREFS: @@ -2202,9 +2202,9 @@ static void binder_stat_br(struct binder_proc *proc, { trace_binder_return(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { - binder_stats.br[_IOC_NR(cmd)]++; - proc->stats.br[_IOC_NR(cmd)]++; - thread->stats.br[_IOC_NR(cmd)]++; + atomic_inc(&binder_stats.br[_IOC_NR(cmd)]); + atomic_inc(&proc->stats.br[_IOC_NR(cmd)]); + atomic_inc(&thread->stats.br[_IOC_NR(cmd)]); } } @@ -3454,17 +3454,21 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, BUILD_BUG_ON(ARRAY_SIZE(stats->bc) != ARRAY_SIZE(binder_command_strings)); for (i = 0; i < ARRAY_SIZE(stats->bc); i++) { - if (stats->bc[i]) + int temp = atomic_read(&stats->bc[i]); + + if (temp) seq_printf(m, "%s%s: %d\n", prefix, - binder_command_strings[i], stats->bc[i]); + binder_command_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->br) != ARRAY_SIZE(binder_return_strings)); for (i = 0; i < ARRAY_SIZE(stats->br); i++) { - if (stats->br[i]) + int temp = atomic_read(&stats->br[i]); + + if (temp) seq_printf(m, "%s%s: %d\n", prefix, - binder_return_strings[i], stats->br[i]); + binder_return_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != @@ -3472,11 +3476,15 @@ static void print_binder_stats(struct seq_file *m, const char *prefix, BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != 
ARRAY_SIZE(stats->obj_deleted)); for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { - if (stats->obj_created[i] || stats->obj_deleted[i]) - seq_printf(m, "%s%s: active %d total %d\n", prefix, + int created = atomic_read(&stats->obj_created[i]); + int deleted = atomic_read(&stats->obj_deleted[i]); + + if (created || deleted) + seq_printf(m, "%s%s: active %d total %d\n", + prefix, binder_objstat_strings[i], - stats->obj_created[i] - stats->obj_deleted[i], - stats->obj_created[i]); + created - deleted, + created); } } From be4dde1f05cda42d74035c50d944a6ec0ef365d7 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 25 May 2017 10:56:00 -0700 Subject: [PATCH 0908/3220] FROMLIST: binder: make binder_last_id an atomic (from https://patchwork.kernel.org/patch/9817809/) Change-Id: I12a505091d377ca9034861317b7e68c2e75f7256 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 0b6ff5f84322..edbd729f2f43 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -58,7 +58,7 @@ static DEFINE_SPINLOCK(binder_dead_nodes_lock); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; -static int binder_last_id; +static atomic_t binder_last_id; static struct workqueue_struct *binder_deferred_workqueue; #define BINDER_DEBUG_ENTRY(name) \ @@ -496,7 +496,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, binder_stats_created(BINDER_STAT_NODE); rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); - node->debug_id = ++binder_last_id; + node->debug_id = atomic_inc_return(&binder_last_id); node->proc = proc; node->ptr = ptr; node->cookie = cookie; @@ -640,7 +640,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, if (new_ref == NULL) return NULL; binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = ++binder_last_id; + new_ref->debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); @@ -1528,7 +1528,7 @@ static void binder_transaction(struct binder_proc *proc, } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - t->debug_id = ++binder_last_id; + t->debug_id = atomic_inc_return(&binder_last_id); e->debug_id = t->debug_id; if (reply) From 0a0fdc1fdc20beec7da852d2392f3111e3555983 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 22 Mar 2017 17:19:52 -0700 Subject: [PATCH 0909/3220] FROMLIST: binder: add log information for binder transaction failures (from https://patchwork.kernel.org/patch/9817751/) Add additional information to determine the cause of binder failures. 
Adds the following to failed transaction log and kernel messages: return_error : value returned for transaction return_error_param : errno returned by binder allocator return_error_line : line number where error detected Also, return BR_DEAD_REPLY if an allocation error indicates a dead proc (-ESRCH) Bug: 36406078 Change-Id: Ifc8881fa5adfcced3f2d67f9030fbd3efa3e2cab Test: tested manually Signed-off-by: Todd Kjos --- drivers/android/binder.c | 87 ++++++++++++++++++++++++++++++---- drivers/android/binder_alloc.c | 20 ++++---- 2 files changed, 88 insertions(+), 19 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index edbd729f2f43..5990f7fbcbd3 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -196,6 +196,9 @@ struct binder_transaction_log_entry { int to_node; int data_size; int offsets_size; + int return_error_line; + uint32_t return_error; + uint32_t return_error_param; const char *context_name; }; struct binder_transaction_log { @@ -1143,7 +1146,7 @@ static int binder_translate_binder(struct flat_binder_object *fp, ref = binder_get_ref_for_node(target_proc, node); if (!ref) - return -EINVAL; + return -ENOMEM; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; @@ -1200,7 +1203,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, new_ref = binder_get_ref_for_node(target_proc, ref->node); if (!new_ref) - return -EINVAL; + return -ENOMEM; fp->binder = 0; fp->handle = new_ref->desc; @@ -1398,7 +1401,9 @@ static void binder_transaction(struct binder_proc *proc, wait_queue_head_t *target_wait; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; - uint32_t return_error; + uint32_t return_error = 0; + uint32_t return_error_param = 0; + uint32_t return_error_line = 0; struct binder_buffer_object *last_fixup_obj = NULL; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; @@ -1418,6 +1423,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got reply transaction with no transaction stack\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EPROTO; + return_error_line = __LINE__; goto err_empty_call_stack; } binder_set_nice(in_reply_to->saved_priority); @@ -1429,6 +1436,8 @@ static void binder_transaction(struct binder_proc *proc, in_reply_to->to_thread ? 
in_reply_to->to_thread->pid : 0); return_error = BR_FAILED_REPLY; + return_error_param = -EPROTO; + return_error_line = __LINE__; in_reply_to = NULL; goto err_bad_call_stack; } @@ -1436,6 +1445,7 @@ static void binder_transaction(struct binder_proc *proc, target_thread = in_reply_to->from; if (target_thread == NULL) { return_error = BR_DEAD_REPLY; + return_error_line = __LINE__; goto err_dead_binder; } if (target_thread->transaction_stack != in_reply_to) { @@ -1445,6 +1455,8 @@ static void binder_transaction(struct binder_proc *proc, target_thread->transaction_stack->debug_id : 0, in_reply_to->debug_id); return_error = BR_FAILED_REPLY; + return_error_param = -EPROTO; + return_error_line = __LINE__; in_reply_to = NULL; target_thread = NULL; goto err_dead_binder; @@ -1459,6 +1471,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_invalid_target_handle; } target_node = ref->node; @@ -1468,6 +1482,7 @@ static void binder_transaction(struct binder_proc *proc, if (target_node == NULL) { return_error = BR_DEAD_REPLY; mutex_unlock(&context->context_mgr_node_lock); + return_error_line = __LINE__; goto err_no_context_mgr_node; } mutex_unlock(&context->context_mgr_node_lock); @@ -1476,11 +1491,14 @@ static void binder_transaction(struct binder_proc *proc, target_proc = target_node->proc; if (target_proc == NULL) { return_error = BR_DEAD_REPLY; + return_error_line = __LINE__; goto err_dead_binder; } if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; + return_error_param = -EPERM; + return_error_line = __LINE__; goto err_invalid_target_handle; } if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { @@ -1494,6 +1512,8 @@ static void binder_transaction(struct binder_proc *proc, tmp->to_thread ? tmp->to_thread->pid : 0); return_error = BR_FAILED_REPLY; + return_error_param = -EPROTO; + return_error_line = __LINE__; goto err_bad_call_stack; } while (tmp) { @@ -1517,6 +1537,8 @@ static void binder_transaction(struct binder_proc *proc, t = kzalloc(sizeof(*t), GFP_KERNEL); if (t == NULL) { return_error = BR_FAILED_REPLY; + return_error_param = -ENOMEM; + return_error_line = __LINE__; goto err_alloc_t_failed; } binder_stats_created(BINDER_STAT_TRANSACTION); @@ -1524,6 +1546,8 @@ static void binder_transaction(struct binder_proc *proc, tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { return_error = BR_FAILED_REPLY; + return_error_param = -ENOMEM; + return_error_line = __LINE__; goto err_alloc_tcomplete_failed; } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); @@ -1566,8 +1590,15 @@ static void binder_transaction(struct binder_proc *proc, t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, tr->offsets_size, extra_buffers_size, !reply && (t->flags & TF_ONE_WAY)); - if (t->buffer == NULL) { - return_error = BR_FAILED_REPLY; + if (IS_ERR(t->buffer)) { + /* + * -ESRCH indicates VMA cleared. The target is dying. + */ + return_error_param = PTR_ERR(t->buffer); + return_error = return_error_param == -ESRCH ? 
+ BR_DEAD_REPLY : BR_FAILED_REPLY; + return_error_line = __LINE__; + t->buffer = NULL; goto err_binder_alloc_buf_failed; } t->buffer->allow_user_free = 0; @@ -1587,6 +1618,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EFAULT; + return_error_line = __LINE__; goto err_copy_data_failed; } if (copy_from_user(offp, (const void __user *)(uintptr_t) @@ -1594,12 +1627,16 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EFAULT; + return_error_line = __LINE__; goto err_copy_data_failed; } if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) { binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n", proc->pid, thread->pid, (u64)tr->offsets_size); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_offset; } if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { @@ -1607,6 +1644,8 @@ static void binder_transaction(struct binder_proc *proc, proc->pid, thread->pid, (u64)extra_buffers_size); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_offset; } off_end = (void *)off_start + tr->offsets_size; @@ -1623,6 +1662,8 @@ static void binder_transaction(struct binder_proc *proc, (u64)off_min, (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_offset; } @@ -1637,6 +1678,8 @@ static void binder_transaction(struct binder_proc *proc, ret = binder_translate_binder(fp, t, thread); if (ret < 0) { return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; goto err_translate_failed; } } break; @@ -1648,6 +1691,8 @@ static void binder_transaction(struct binder_proc *proc, ret = binder_translate_handle(fp, t, thread); if (ret < 0) { return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; goto err_translate_failed; } } break; @@ -1659,6 +1704,8 @@ static void binder_transaction(struct binder_proc *proc, if (target_fd < 0) { return_error = BR_FAILED_REPLY; + return_error_param = target_fd; + return_error_line = __LINE__; goto err_translate_failed; } fp->pad_binder = 0; @@ -1675,6 +1722,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_parent; } if (!binder_validate_fixup(t->buffer, off_start, @@ -1684,12 +1733,16 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_parent; } ret = binder_translate_fd_array(fda, parent, t, thread, in_reply_to); if (ret < 0) { return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj = parent; @@ -1705,6 +1758,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with too large buffer\n", proc->pid, thread->pid); 
return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_offset; } if (copy_from_user(sg_bufp, @@ -1712,7 +1767,9 @@ static void binder_transaction(struct binder_proc *proc, bp->buffer, bp->length)) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); + return_error_param = -EFAULT; return_error = BR_FAILED_REPLY; + return_error_line = __LINE__; goto err_copy_data_failed; } /* Fixup buffer pointer to target proc address space */ @@ -1727,6 +1784,8 @@ static void binder_transaction(struct binder_proc *proc, last_fixup_min_off); if (ret < 0) { return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj = bp; @@ -1736,6 +1795,8 @@ static void binder_transaction(struct binder_proc *proc, binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; + return_error_param = -EINVAL; + return_error_line = __LINE__; goto err_bad_object_type; } } @@ -1790,13 +1851,17 @@ err_dead_binder: err_invalid_target_handle: err_no_context_mgr_node: binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "%d:%d transaction failed %d, size %lld-%lld\n", - proc->pid, thread->pid, return_error, - (u64)tr->data_size, (u64)tr->offsets_size); + "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", + proc->pid, thread->pid, return_error, return_error_param, + (u64)tr->data_size, (u64)tr->offsets_size, + return_error_line); { struct binder_transaction_log_entry *fe; + e->return_error = return_error; + e->return_error_param = return_error_param; + e->return_error_line = return_error_line; fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; } @@ -3623,11 +3688,13 @@ static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { seq_printf(m, - "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d\n", + "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d\n", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, e->from_thread, e->to_proc, e->to_thread, e->context_name, - e->to_node, e->target_handle, e->data_size, e->offsets_size); + e->to_node, e->target_handle, e->data_size, e->offsets_size, + e->return_error, e->return_error_param, + e->return_error_line); } static int binder_transaction_log_show(struct seq_file *m, void *unused) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index eaef8f8fb859..16bc8a3b53f5 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -262,7 +262,7 @@ err_no_vma: up_write(&mm->mmap_sem); mmput(mm); } - return -ENOMEM; + return vma ? 
-ENOMEM : -ESRCH; } struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, @@ -278,11 +278,12 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, void *has_page_addr; void *end_page_addr; size_t size, data_offsets_size; + int ret; if (alloc->vma == NULL) { pr_err("%d: binder_alloc_buf, no vma\n", alloc->pid); - return NULL; + return ERR_PTR(-ESRCH); } data_offsets_size = ALIGN(data_size, sizeof(void *)) + @@ -292,21 +293,21 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: got transaction with invalid size %zd-%zd\n", alloc->pid, data_size, offsets_size); - return NULL; + return ERR_PTR(-EINVAL); } size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); if (size < data_offsets_size || size < extra_buffers_size) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: got transaction with invalid extra_buffers_size %zd\n", alloc->pid, extra_buffers_size); - return NULL; + return ERR_PTR(-EINVAL); } if (is_async && alloc->free_async_space < size + sizeof(struct binder_buffer)) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd failed, no async space left\n", alloc->pid, size); - return NULL; + return ERR_PTR(-ENOSPC); } while (n) { @@ -327,7 +328,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (best_fit == NULL) { pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", alloc->pid, size); - return NULL; + return ERR_PTR(-ENOSPC); } if (n == NULL) { buffer = rb_entry(best_fit, struct binder_buffer, rb_node); @@ -350,9 +351,10 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; - if (binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) - return NULL; + ret = binder_update_page_range(alloc, 1, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + if (ret) + return ERR_PTR(ret); rb_erase(best_fit, &alloc->free_buffers); buffer->free = 0; From 9b9340c58afa2cf2a76ae06f3019ad631e6f7799 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 24 May 2017 10:51:01 -0700 Subject: [PATCH 0910/3220] FROMLIST: binder: refactor queue management in binder_thread_read (from https://patchwork.kernel.org/patch/9817757/) In binder_thread_read, the BINDER_WORK_NODE command is used to communicate the references on the node to userspace. It can take a couple of iterations in the loop to construct the list of commands for user space. When locking is added, the lock would need to be released on each iteration, which means the state could change. The work item is not dequeued during this process, which prevents a simpler queue management that can just dequeue up front and handle the work item. Fixed by changing the BINDER_WORK_NODE algorithm in binder_thread_read to determine which commands to send to userspace atomically in 1 pass so it stays consistent with the kernel view. The work item is now dequeued immediately since only 1 pass is needed.
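To make the single-pass scheme concrete, here is a minimal stand-alone C model of the decision logic (hypothetical types and names; the real code walks struct binder_node in binder_thread_read and emits the BR_* commands through binder_put_node_cmd()):

#include <stdbool.h>
#include <stdio.h>

struct node_state {
	int internal_strong_refs, local_strong_refs, local_weak_refs;
	bool has_refs;			/* stands in for !hlist_empty(&node->refs) */
	bool has_strong_ref, has_weak_ref;
};

/* Compute every command implied by the node's state in one pass, in the
 * order used after this patch: INCREFS, ACQUIRE, RELEASE, DECREFS. */
static void emit_node_cmds(struct node_state *n)
{
	int strong = n->internal_strong_refs || n->local_strong_refs;
	int weak = n->has_refs || n->local_weak_refs || strong;
	bool had_strong = n->has_strong_ref, had_weak = n->has_weak_ref;

	/* Update the bookkeeping first, from a single snapshot ... */
	if (weak && !had_weak)
		n->has_weak_ref = true;
	if (strong && !had_strong)
		n->has_strong_ref = true;
	if (!strong && had_strong)
		n->has_strong_ref = false;
	if (!weak && had_weak)
		n->has_weak_ref = false;

	/* ... then emit the user-space commands from the same snapshot. */
	if (weak && !had_weak)
		puts("BR_INCREFS");
	if (strong && !had_strong)
		puts("BR_ACQUIRE");
	if (!strong && had_strong)
		puts("BR_RELEASE");
	if (!weak && had_weak)
		puts("BR_DECREFS");
}

int main(void)
{
	struct node_state n = { .internal_strong_refs = 1 };

	emit_node_cmds(&n);	/* prints BR_INCREFS, then BR_ACQUIRE */
	return 0;
}

Because all four tests read the same strong/weak/had_* snapshot, no retry loop is needed and the work item can be dequeued before any command is written.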
Change-Id: I9b4109997b2d53ba661867b14d7336cd076be06d Signed-off-by: Todd Kjos --- drivers/android/binder.c | 147 ++++++++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 55 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 5990f7fbcbd3..4e7439506924 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2286,6 +2286,37 @@ static int binder_has_thread_work(struct binder_thread *thread) (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); } +static int binder_put_node_cmd(struct binder_proc *proc, + struct binder_thread *thread, + void __user **ptrp, + binder_uintptr_t node_ptr, + binder_uintptr_t node_cookie, + int node_debug_id, + uint32_t cmd, const char *cmd_name) +{ + void __user *ptr = *ptrp; + + if (put_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + + if (put_user(node_ptr, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + + if (put_user(node_cookie, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + + binder_stat_br(proc, thread, cmd); + binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s %d u%016llx c%016llx\n", + proc->pid, thread->pid, cmd_name, node_debug_id, + (u64)node_ptr, (u64)node_cookie); + + *ptrp = ptr; + return 0; +} + static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, @@ -2411,72 +2442,78 @@ retry: } break; case BINDER_WORK_NODE: { struct binder_node *node = container_of(w, struct binder_node, work); - uint32_t cmd = BR_NOOP; - const char *cmd_name; - int strong = node->internal_strong_refs || node->local_strong_refs; - int weak = !hlist_empty(&node->refs) || node->local_weak_refs || strong; + int strong, weak; + binder_uintptr_t node_ptr = node->ptr; + binder_uintptr_t node_cookie = node->cookie; + int node_debug_id = node->debug_id; + int has_weak_ref; + int has_strong_ref; + void __user *orig_ptr = ptr; - if (weak && !node->has_weak_ref) { - cmd = BR_INCREFS; - cmd_name = "BR_INCREFS"; + BUG_ON(proc != node->proc); + strong = node->internal_strong_refs || + node->local_strong_refs; + weak = !hlist_empty(&node->refs) || + node->local_weak_refs || strong; + has_strong_ref = node->has_strong_ref; + has_weak_ref = node->has_weak_ref; + + if (weak && !has_weak_ref) { node->has_weak_ref = 1; node->pending_weak_ref = 1; node->local_weak_refs++; - } else if (strong && !node->has_strong_ref) { - cmd = BR_ACQUIRE; - cmd_name = "BR_ACQUIRE"; + } + if (strong && !has_strong_ref) { node->has_strong_ref = 1; node->pending_strong_ref = 1; node->local_strong_refs++; - } else if (!strong && node->has_strong_ref) { - cmd = BR_RELEASE; - cmd_name = "BR_RELEASE"; + } + if (!strong && has_strong_ref) node->has_strong_ref = 0; - } else if (!weak && node->has_weak_ref) { - cmd = BR_DECREFS; - cmd_name = "BR_DECREFS"; + if (!weak && has_weak_ref) node->has_weak_ref = 0; - } - if (cmd != BR_NOOP) { - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (put_user(node->ptr, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - if (put_user(node->cookie, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); + list_del(&w->entry); - binder_stat_br(proc, thread, cmd); - binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s %d u%016llx c%016llx\n", - proc->pid, thread->pid, cmd_name, - node->debug_id, - (u64)node->ptr, (u64)node->cookie); - } 
else { - list_del_init(&w->entry); - if (!weak && !strong) { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d:%d node %d u%016llx c%016llx deleted\n", - proc->pid, thread->pid, - node->debug_id, - (u64)node->ptr, - (u64)node->cookie); - rb_erase(&node->rb_node, &proc->nodes); - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); - } else { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d:%d node %d u%016llx c%016llx state unchanged\n", - proc->pid, thread->pid, - node->debug_id, - (u64)node->ptr, - (u64)node->cookie); - } + if (!weak && !strong) { + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d:%d node %d u%016llx c%016llx deleted\n", + proc->pid, thread->pid, + node_debug_id, + (u64)node_ptr, + (u64)node_cookie); + rb_erase(&node->rb_node, &proc->nodes); + kfree(node); + binder_stats_deleted(BINDER_STAT_NODE); } + if (weak && !has_weak_ref) + ret = binder_put_node_cmd( + proc, thread, &ptr, node_ptr, + node_cookie, node_debug_id, + BR_INCREFS, "BR_INCREFS"); + if (!ret && strong && !has_strong_ref) + ret = binder_put_node_cmd( + proc, thread, &ptr, node_ptr, + node_cookie, node_debug_id, + BR_ACQUIRE, "BR_ACQUIRE"); + if (!ret && !strong && has_strong_ref) + ret = binder_put_node_cmd( + proc, thread, &ptr, node_ptr, + node_cookie, node_debug_id, + BR_RELEASE, "BR_RELEASE"); + if (!ret && !weak && has_weak_ref) + ret = binder_put_node_cmd( + proc, thread, &ptr, node_ptr, + node_cookie, node_debug_id, + BR_DECREFS, "BR_DECREFS"); + if (orig_ptr == ptr) + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d:%d node %d u%016llx c%016llx state unchanged\n", + proc->pid, thread->pid, + node_debug_id, + (u64)node_ptr, + (u64)node_cookie); + if (ret) + return ret; } break; case BINDER_WORK_DEAD_BINDER: case BINDER_WORK_DEAD_BINDER_AND_CLEAR: From 6ea6027115718c27e01e4459901c7938492cfa8f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 9 May 2017 08:31:32 -0700 Subject: [PATCH 0911/3220] FROMLIST: binder: avoid race conditions when enqueuing txn (from https://patchwork.kernel.org/patch/9817813/) Currently, the transaction complete work item is queued after the transaction. This means that it is possible for the transaction to be handled and a reply to be enqueued in the current thread before the transaction complete is enqueued, which violates the protocol with userspace, which may not expect the transaction complete. Fixed by always enqueuing the transaction complete first. Also, once the transaction is enqueued, it is unsafe to access since it might be freed. Currently, t->flags is accessed to determine whether a sync wake is needed. Changed to access tr->flags instead.
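A stand-alone sketch of the reordering (hypothetical queue helper; the real code uses list_add_tail() on thread->todo and on target_list):

#include <stdbool.h>
#include <stdio.h>

struct txn { bool one_way; };

static void enqueue(const char *item, const char *queue)
{
	printf("enqueue %s on %s\n", item, queue);
}

static void send_txn(struct txn *t, bool reply)
{
	/* Read everything needed from *t up front; once the transaction
	 * is visible to the target it may be handled and freed. */
	bool one_way = t->one_way;

	/* Queue the completion first so this thread always reads
	 * BR_TRANSACTION_COMPLETE before any reply can be queued. */
	enqueue("tcomplete", "thread->todo");
	enqueue("transaction", "target_list");
	if (reply || !one_way)
		puts("wake_up_interruptible_sync(target_wait)");
	else
		puts("wake_up_interruptible(target_wait)");
}

int main(void)
{
	struct txn t = { .one_way = false };

	send_txn(&t, false);
	return 0;
}

The kernel patch gets the same effect by testing tr->flags (the caller-supplied data) rather than t->flags after t has been queued.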
Change-Id: I6c01566e167a39cf17c9027c3817618182e56975 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4e7439506924..5b1e45cef89e 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1800,6 +1800,9 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_object_type; } } + tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; + list_add_tail(&tcomplete->entry, &thread->todo); + if (reply) { BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction(target_thread, in_reply_to); @@ -1819,10 +1822,8 @@ static void binder_transaction(struct binder_proc *proc, } t->work.type = BINDER_WORK_TRANSACTION; list_add_tail(&t->work.entry, target_list); - tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; - list_add_tail(&tcomplete->entry, &thread->todo); if (target_wait) { - if (reply || !(t->flags & TF_ONE_WAY)) + if (reply || !(tr->flags & TF_ONE_WAY)) wake_up_interruptible_sync(target_wait); else wake_up_interruptible(target_wait); From afda44d0aa52f4427a6867fe784997f307b7fe31 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 6 Jan 2017 14:19:25 -0800 Subject: [PATCH 0912/3220] FROMLIST: binder: don't modify thread->looper from other threads (from https://patchwork.kernel.org/patch/9817799/) The looper member of struct binder_thread is a bitmask of control bits. All of the existing bits are modified by the affected thread except for BINDER_LOOPER_STATE_NEED_RETURN which can be modified in binder_deferred_flush() by another thread. To avoid adding a spinlock around all read-mod-writes to modify a bit, the BINDER_LOOPER_STATE_NEED_RETURN flag is replaced by a separate field in struct binder_thread. 
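A stand-alone sketch of the hazard being avoided (hypothetical flag values; the real fields live in struct binder_thread):

#include <stdbool.h>
#include <stdio.h>

enum { STATE_REGISTERED = 0x01, STATE_WAITING = 0x10 };

struct thread_state {
	unsigned int looper;	 /* bit flags, now written only by the owning thread */
	bool looper_need_return; /* whole field, may be set by another thread */
};

int main(void)
{
	struct thread_state t = { .looper = STATE_REGISTERED };

	/* Before the patch, binder_deferred_flush() did the equivalent of
	 *	t.looper |= NEED_RETURN_BIT;
	 * a load/modify/store that races with the owner's own
	 * read-modify-writes of t.looper and can lose an update.
	 * After the patch it performs a plain store to a dedicated field: */
	t.looper_need_return = true;

	printf("looper=%#x need_return=%d\n", t.looper, t.looper_need_return);
	return 0;
}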
Bug: 33250092 32225111 Change-Id: Ia4cefbdbd683c6cb17c323ba7d278de5f2ca0745 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 5b1e45cef89e..6eb9fc064846 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -334,14 +334,14 @@ enum { BINDER_LOOPER_STATE_EXITED = 0x04, BINDER_LOOPER_STATE_INVALID = 0x08, BINDER_LOOPER_STATE_WAITING = 0x10, - BINDER_LOOPER_STATE_NEED_RETURN = 0x20 }; struct binder_thread { struct binder_proc *proc; struct rb_node rb_node; int pid; - int looper; + int looper; /* only modified by this thread */ + bool looper_need_return; /* can be written by other thread */ struct binder_transaction *transaction_stack; struct list_head todo; uint32_t return_error; /* Write failed, return error code in read buf */ @@ -2277,14 +2277,13 @@ static void binder_stat_br(struct binder_proc *proc, static int binder_has_proc_work(struct binder_proc *proc, struct binder_thread *thread) { - return !list_empty(&proc->todo) || - (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); + return !list_empty(&proc->todo) || thread->looper_need_return; } static int binder_has_thread_work(struct binder_thread *thread) { return !list_empty(&thread->todo) || thread->return_error != BR_OK || - (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); + thread->looper_need_return; } static int binder_put_node_cmd(struct binder_proc *proc, @@ -2413,8 +2412,7 @@ retry: entry); } else { /* no data added */ - if (ptr - buffer == 4 && - !(thread->looper & BINDER_LOOPER_STATE_NEED_RETURN)) + if (ptr - buffer == 4 && !thread->looper_need_return) goto retry; break; } @@ -2728,7 +2726,7 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) INIT_LIST_HEAD(&thread->todo); rb_link_node(&thread->rb_node, parent, p); rb_insert_color(&thread->rb_node, &proc->threads); - thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN; + thread->looper_need_return = true; thread->return_error = BR_OK; thread->return_error2 = BR_OK; } @@ -2984,7 +2982,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ret = 0; err: if (thread) - thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN; + thread->looper_need_return = false; binder_unlock(__func__); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) @@ -3139,7 +3137,7 @@ static void binder_deferred_flush(struct binder_proc *proc) for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node); - thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN; + thread->looper_need_return = true; if (thread->looper & BINDER_LOOPER_STATE_WAITING) { wake_up_interruptible(&thread->wait); wake_count++; @@ -3400,7 +3398,9 @@ static void print_binder_thread(struct seq_file *m, size_t start_pos = m->count; size_t header_pos; - seq_printf(m, " thread %d: l %02x\n", thread->pid, thread->looper); + seq_printf(m, " thread %d: l %02x need_return %d\n", + thread->pid, thread->looper, + thread->looper_need_return); header_pos = m->count; t = thread->transaction_stack; while (t) { From ce9b7747d6a1cb1b4b772b4e2c524ba1a56b551b Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 1 May 2017 17:21:51 -0700 Subject: [PATCH 0913/3220] FROMLIST: binder: remove dead code in binder_get_ref_for_node (from https://patchwork.kernel.org/patch/9817819/) node is always non-NULL in 
binder_get_ref_for_node so the conditional and else clause are not needed Change-Id: I23f011ba59e1869d9577e6bf28e1f1dd38f45713 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 6eb9fc064846..812c86bae4db 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -671,18 +671,12 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, } rb_link_node(&new_ref->rb_node_desc, parent, p); rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); - if (node) { - hlist_add_head(&new_ref->node_entry, &node->refs); + hlist_add_head(&new_ref->node_entry, &node->refs); - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d new ref %d desc %d for node %d\n", - proc->pid, new_ref->debug_id, new_ref->desc, - node->debug_id); - } else { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d new ref %d desc %d for dead node\n", - proc->pid, new_ref->debug_id, new_ref->desc); - } + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d new ref %d desc %d for node %d\n", + proc->pid, new_ref->debug_id, new_ref->desc, + node->debug_id); return new_ref; } From db51658467e03c756b7de23eb03ebeb63e607d4c Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 21 Apr 2017 14:32:11 -0700 Subject: [PATCH 0914/3220] FROMLIST: binder: protect against two threads freeing buffer (from https://patchwork.kernel.org/patch/9817815/) Adds protection against malicious user code freeing the same buffer at the same time which could cause a crash. Cannot happen under normal use. Bug: 36650912 Change-Id: I43e078cbf31c0789aaff5ceaf8f1a94c75f79d45 Test: tested manually Signed-off-by: Todd Kjos --- drivers/android/binder.c | 4 ++-- drivers/android/binder_alloc.c | 22 +++++++++++++++++----- drivers/android/binder_alloc.h | 7 ++++--- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 812c86bae4db..d484655133ee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2025,8 +2025,8 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(binder_uintptr_t); - buffer = binder_alloc_buffer_lookup(&proc->alloc, - data_ptr); + buffer = binder_alloc_prepare_to_free(&proc->alloc, + data_ptr); if (buffer == NULL) { binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n", proc->pid, thread->pid, (u64)data_ptr); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 16bc8a3b53f5..8f79df3f3613 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -116,7 +116,7 @@ static void binder_insert_allocated_buffer_locked( rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); } -static struct binder_buffer *binder_alloc_buffer_lookup_locked( +static struct binder_buffer *binder_alloc_prepare_to_free_locked( struct binder_alloc *alloc, uintptr_t user_ptr) { @@ -135,8 +135,19 @@ static struct binder_buffer *binder_alloc_buffer_lookup_locked( n = n->rb_left; else if (kern_ptr > buffer) n = n->rb_right; - else + else { + /* + * Guard against user threads attempting to + * free the buffer twice + */ + if (buffer->free_in_progress) { + pr_err("%d:%d FREE_BUFFER u%016llx user freed buffer twice\n", + alloc->pid, current->pid, (u64)user_ptr); + return NULL; + } + buffer->free_in_progress = 1; return buffer; + } } return NULL; } @@ -152,13 +163,13 @@ static struct binder_buffer *binder_alloc_buffer_lookup_locked( * 
* Return: Pointer to buffer or NULL */ -struct binder_buffer *binder_alloc_buffer_lookup(struct binder_alloc *alloc, - uintptr_t user_ptr) +struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, + uintptr_t user_ptr) { struct binder_buffer *buffer; mutex_lock(&alloc->mutex); - buffer = binder_alloc_buffer_lookup_locked(alloc, user_ptr); + buffer = binder_alloc_prepare_to_free_locked(alloc, user_ptr); mutex_unlock(&alloc->mutex); return buffer; } @@ -358,6 +369,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, rb_erase(best_fit, &alloc->free_buffers); buffer->free = 0; + buffer->free_in_progress = 0; binder_insert_allocated_buffer_locked(alloc, buffer); if (buffer_size != size) { struct binder_buffer *new_buffer = (void *)buffer->data + size; diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 721c511431f9..088e4ffc6230 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -48,7 +48,8 @@ struct binder_buffer { unsigned free:1; unsigned allow_user_free:1; unsigned async_transaction:1; - unsigned debug_id:29; + unsigned free_in_progress:1; + unsigned debug_id:28; struct binder_transaction *transaction; @@ -109,8 +110,8 @@ extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, extern void binder_alloc_init(struct binder_alloc *alloc); extern void binder_alloc_vma_close(struct binder_alloc *alloc); extern struct binder_buffer * -binder_alloc_buffer_lookup(struct binder_alloc *alloc, - uintptr_t user_ptr); +binder_alloc_prepare_to_free(struct binder_alloc *alloc, + uintptr_t user_ptr); extern void binder_alloc_free_buf(struct binder_alloc *alloc, struct binder_buffer *buffer); extern int binder_alloc_mmap_handler(struct binder_alloc *alloc, From 42e1ca789418a06b5ed727f527bb0f329bc28c26 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 15 Mar 2017 18:22:52 +0100 Subject: [PATCH 0915/3220] FROMLIST: binder: add more debug info when allocation fails. 
(from https://patchwork.kernel.org/patch/9817797/) Bug: 36088202 Test: tested manually Change-Id: Ib526a9c375e6136669b72f341e0b54d896fd1cec Signed-off-by: Martijn Coenen Signed-off-by: Siqi Lin --- drivers/android/binder_alloc.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 8f79df3f3613..aabfebac6e57 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -337,8 +337,36 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, } } if (best_fit == NULL) { + size_t allocated_buffers = 0; + size_t largest_alloc_size = 0; + size_t total_alloc_size = 0; + size_t free_buffers = 0; + size_t largest_free_size = 0; + size_t total_free_size = 0; + + for (n = rb_first(&alloc->allocated_buffers); n != NULL; + n = rb_next(n)) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + buffer_size = binder_alloc_buffer_size(alloc, buffer); + allocated_buffers++; + total_alloc_size += buffer_size; + if (buffer_size > largest_alloc_size) + largest_alloc_size = buffer_size; + } + for (n = rb_first(&alloc->free_buffers); n != NULL; + n = rb_next(n)) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + buffer_size = binder_alloc_buffer_size(alloc, buffer); + free_buffers++; + total_free_size += buffer_size; + if (buffer_size > largest_free_size) + largest_free_size = buffer_size; + } pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", alloc->pid, size); + pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n", + total_alloc_size, allocated_buffers, largest_alloc_size, + total_free_size, free_buffers, largest_free_size); return ERR_PTR(-ENOSPC); } if (n == NULL) { @@ -698,9 +726,10 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) static void print_binder_buffer(struct seq_file *m, const char *prefix, struct binder_buffer *buffer) { - seq_printf(m, "%s %d: %pK size %zd:%zd %s\n", + seq_printf(m, "%s %d: %pK size %zd:%zd:%zd %s\n", prefix, buffer->debug_id, buffer->data, buffer->data_size, buffer->offsets_size, + buffer->extra_buffers_size, buffer->transaction ? "active" : "delivered"); } From 0f32aeb35f8b8c3d07f1c03062cc195590145d35 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 24 May 2017 13:33:28 -0700 Subject: [PATCH 0916/3220] FROMLIST: binder: use atomic for transaction_log index (from https://patchwork.kernel.org/patch/9817807/) The log->next index for the transaction log was not protected when incremented. This led to a case where log->next++ resulted in an index larger than ARRAY_SIZE(log->entry) and eventually a bad access to memory. 
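For illustration, the old update amounted to the following unsynchronized read-modify-write on a shared index (a simplified stand-alone model, not the driver source):

#include <stdio.h>

#define N_ENTRIES 32

struct txn_log { int next; int full; int entry[N_ENTRIES]; };

int main(void)
{
	struct txn_log log = { 0 };

	/* With two concurrent writers, one thread can observe next == 32
	 * after another thread's increment but before its wrap-around
	 * reset below, and then index entry[32]: out of bounds. */
	int *e = &log.entry[log.next];

	log.next++;
	if (log.next == N_ENTRIES) {
		log.next = 0;
		log.full = 1;
	}
	*e = 42;	/* stand-in for filling in the entry */
	printf("next=%d full=%d\n", log.next, log.full);
	return 0;
}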
Fixed by making the log index an atomic64 and converting to an array by using "% ARRAY_SIZE(log->entry)" Also added "complete" field to the log entry which is written last to tell the print code whether the entry is complete Bug: 62038227 Test: tested manually Change-Id: I1bb1c1a332a6ac458a626f5bedd05022b56b91f2 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 74 +++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d484655133ee..33d7e981324d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -187,6 +187,7 @@ static inline void binder_stats_created(enum binder_stat_types type) struct binder_transaction_log_entry { int debug_id; + int debug_id_done; int call_type; int from_proc; int from_thread; @@ -202,8 +203,8 @@ struct binder_transaction_log_entry { const char *context_name; }; struct binder_transaction_log { - int next; - int full; + atomic_t cur; + bool full; struct binder_transaction_log_entry entry[32]; }; static struct binder_transaction_log binder_transaction_log; @@ -213,14 +214,19 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) { struct binder_transaction_log_entry *e; + unsigned int cur = atomic_inc_return(&log->cur); - e = &log->entry[log->next]; - memset(e, 0, sizeof(*e)); - log->next++; - if (log->next == ARRAY_SIZE(log->entry)) { - log->next = 0; + if (cur >= ARRAY_SIZE(log->entry)) log->full = 1; - } + e = &log->entry[cur % ARRAY_SIZE(log->entry)]; + WRITE_ONCE(e->debug_id_done, 0); + /* + * write-barrier to synchronize access to e->debug_id_done. + * We make sure the initialized 0 value is seen before + * memset() other fields are zeroed by memset. + */ + smp_wmb(); + memset(e, 0, sizeof(*e)); return e; } @@ -1401,8 +1407,10 @@ static void binder_transaction(struct binder_proc *proc, struct binder_buffer_object *last_fixup_obj = NULL; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; + int t_debug_id = atomic_inc_return(&binder_last_id); e = binder_transaction_log_add(&binder_transaction_log); + e->debug_id = t_debug_id; e->call_type = reply ? 
2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; @@ -1546,8 +1554,7 @@ static void binder_transaction(struct binder_proc *proc, } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - t->debug_id = atomic_inc_return(&binder_last_id); - e->debug_id = t->debug_id; + t->debug_id = t_debug_id; if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, @@ -1822,6 +1829,12 @@ static void binder_transaction(struct binder_proc *proc, else wake_up_interruptible(target_wait); } + /* + * write barrier to synchronize with initialization + * of log entry + */ + smp_wmb(); + WRITE_ONCE(e->debug_id_done, t_debug_id); return; err_translate_failed: @@ -1859,6 +1872,13 @@ err_no_context_mgr_node: e->return_error_line = return_error_line; fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; + /* + * write barrier to synchronize with initialization + * of log entry + */ + smp_wmb(); + WRITE_ONCE(e->debug_id_done, t_debug_id); + WRITE_ONCE(fe->debug_id_done, t_debug_id); } BUG_ON(thread->return_error != BR_OK); @@ -3719,27 +3739,47 @@ static int binder_proc_show(struct seq_file *m, void *unused) static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { + int debug_id = READ_ONCE(e->debug_id_done); + /* + * read barrier to guarantee debug_id_done read before + * we print the log values + */ + smp_rmb(); seq_printf(m, - "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d\n", + "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d", e->debug_id, (e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, e->from_thread, e->to_proc, e->to_thread, e->context_name, e->to_node, e->target_handle, e->data_size, e->offsets_size, e->return_error, e->return_error_param, e->return_error_line); + /* + * read-barrier to guarantee read of debug_id_done after + * done printing the fields of the entry + */ + smp_rmb(); + seq_printf(m, debug_id && debug_id == READ_ONCE(e->debug_id_done) ? + "\n" : " (incomplete)\n"); } static int binder_transaction_log_show(struct seq_file *m, void *unused) { struct binder_transaction_log *log = m->private; + unsigned int log_cur = atomic_read(&log->cur); + unsigned int count; + unsigned int cur; int i; - if (log->full) { - for (i = log->next; i < ARRAY_SIZE(log->entry); i++) - print_binder_transaction_log_entry(m, &log->entry[i]); + count = log_cur + 1; + cur = count < ARRAY_SIZE(log->entry) && !log->full ? 
+ 0 : count % ARRAY_SIZE(log->entry); + if (count > ARRAY_SIZE(log->entry) || log->full) + count = ARRAY_SIZE(log->entry); + for (i = 0; i < count; i++) { + unsigned int index = cur++ % ARRAY_SIZE(log->entry); + + print_binder_transaction_log_entry(m, &log->entry[index]); } - for (i = 0; i < log->next; i++) - print_binder_transaction_log_entry(m, &log->entry[i]); return 0; } @@ -3794,6 +3834,8 @@ static int __init binder_init(void) struct binder_device *device; struct hlist_node *tmp; + atomic_set(&binder_transaction_log.cur, ~0U); + atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_deferred_workqueue = create_singlethread_workqueue("binder"); if (!binder_deferred_workqueue) return -ENOMEM; From 162735381739bb4f3e5ad6f35a6319d99ee7e697 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 30 Mar 2017 18:02:13 -0700 Subject: [PATCH 0917/3220] FROMLIST: binder: refactor binder_pop_transaction (from https://lkml.org/lkml/2017/6/29/754) binder_pop_transaction needs to be split into 2 pieces to allow the proc lock to be held on entry to dequeue the transaction stack, but no lock when kfree'ing the transaction. Split into binder_pop_transaction_locked and binder_free_transaction (the actual locks are still to be added). Change-Id: I848ae994cc27b3cd083cff2dbd1071762784f4a3 Test: tested manually Signed-off-by: Todd Kjos --- drivers/android/binder.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 33d7e981324d..2006fb7cd69a 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -769,14 +769,16 @@ static int binder_dec_ref(struct binder_ref *ref, int strong) static void binder_pop_transaction(struct binder_thread *target_thread, struct binder_transaction *t) { - if (target_thread) { - BUG_ON(target_thread->transaction_stack != t); - BUG_ON(target_thread->transaction_stack->from != target_thread); - target_thread->transaction_stack = - target_thread->transaction_stack->from_parent; - t->from = NULL; - } - t->need_reply = 0; + BUG_ON(!target_thread); + BUG_ON(target_thread->transaction_stack != t); + BUG_ON(target_thread->transaction_stack->from != target_thread); + target_thread->transaction_stack = + target_thread->transaction_stack->from_parent; + t->from = NULL; +} + +static void binder_free_transaction(struct binder_transaction *t) +{ if (t->buffer) t->buffer->transaction = NULL; kfree(t); @@ -809,6 +811,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, binder_pop_transaction(target_thread, t); target_thread->return_error = error_code; wake_up_interruptible(&target_thread->wait); + binder_free_transaction(t); } else { pr_err("reply failed, target thread, %d:%d, has error code %d already\n", target_thread->proc->pid, @@ -823,7 +826,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, "send failed reply for transaction %d, target dead\n", t->debug_id); - binder_pop_transaction(target_thread, t); + binder_free_transaction(t); if (next == NULL) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "reply failed, no target thread at root\n"); @@ -1807,6 +1810,7 @@ static void binder_transaction(struct binder_proc *proc, if (reply) { BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction(target_thread, in_reply_to); + binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); t->need_reply = 1; @@ -2636,9 +2640,7 @@ retry: t->to_thread = thread; thread->transaction_stack 
= t; } else { - t->buffer->transaction = NULL; - kfree(t); - binder_stats_deleted(BINDER_STAT_TRANSACTION); + binder_free_transaction(t); } break; } @@ -2681,9 +2683,7 @@ static void binder_release_work(struct list_head *list) binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered transaction %d\n", t->debug_id); - t->buffer->transaction = NULL; - kfree(t); - binder_stats_deleted(BINDER_STAT_TRANSACTION); + binder_free_transaction(t); } } break; case BINDER_WORK_TRANSACTION_COMPLETE: { From 3a822b33c84bf30d2081ede2c42d53ae7ab89d85 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 21 Apr 2017 17:35:12 -0700 Subject: [PATCH 0918/3220] FROMLIST: binder: guarantee txn complete / errors delivered in-order (from https://patchwork.kernel.org/patch/9817805/) Since errors are tracked in the return_error/return_error2 fields of the binder_thread object and BR_TRANSACTION_COMPLETEs can be tracked either in those fields or via the thread todo work list, it is possible for errors to be reported ahead of the associated txn complete. Use the thread todo work list for errors to guarantee order. Also changed binder_send_failed_reply to pop the transaction even if it failed to send a reply. Bug: 37218618 Test: tested manually Change-Id: I196cfaeed09fdcd697f8ab25eea4e04241fdb08f Signed-off-by: Todd Kjos --- drivers/android/binder.c | 125 ++++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 53 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2006fb7cd69a..f62919855936 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -249,6 +249,7 @@ struct binder_work { enum { BINDER_WORK_TRANSACTION = 1, BINDER_WORK_TRANSACTION_COMPLETE, + BINDER_WORK_RETURN_ERROR, BINDER_WORK_NODE, BINDER_WORK_DEAD_BINDER, BINDER_WORK_DEAD_BINDER_AND_CLEAR, @@ -256,6 +257,11 @@ struct binder_work { } type; }; +struct binder_error { + struct binder_work work; + uint32_t cmd; +}; + struct binder_node { int debug_id; struct binder_work work; @@ -350,10 +356,8 @@ struct binder_thread { bool looper_need_return; /* can be written by other thread */ struct binder_transaction *transaction_stack; struct list_head todo; - uint32_t return_error; /* Write failed, return error code in read buf */ - uint32_t return_error2; /* Write failed, return error code in read */ - /* buffer. 
Used when sending a reply to a dead process that */ - /* we are also waiting on */ + struct binder_error return_error; + struct binder_error reply_error; wait_queue_head_t wait; struct binder_stats stats; }; @@ -795,29 +799,24 @@ static void binder_send_failed_reply(struct binder_transaction *t, while (1) { target_thread = t->from; if (target_thread) { - if (target_thread->return_error != BR_OK && - target_thread->return_error2 == BR_OK) { - target_thread->return_error2 = - target_thread->return_error; - target_thread->return_error = BR_OK; - } - if (target_thread->return_error == BR_OK) { - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "send failed reply for transaction %d to %d:%d\n", - t->debug_id, - target_thread->proc->pid, - target_thread->pid); + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "send failed reply for transaction %d to %d:%d\n", + t->debug_id, + target_thread->proc->pid, + target_thread->pid); - binder_pop_transaction(target_thread, t); - target_thread->return_error = error_code; + binder_pop_transaction(target_thread, t); + if (target_thread->reply_error.cmd == BR_OK) { + target_thread->reply_error.cmd = error_code; + list_add_tail( + &target_thread->reply_error.work.entry, + &target_thread->todo); wake_up_interruptible(&target_thread->wait); - binder_free_transaction(t); } else { - pr_err("reply failed, target thread, %d:%d, has error code %d already\n", - target_thread->proc->pid, - target_thread->pid, - target_thread->return_error); + WARN(1, "Unexpected reply error: %u\n", + target_thread->reply_error.cmd); } + binder_free_transaction(t); return; } next = t->from_parent; @@ -1885,12 +1884,17 @@ err_no_context_mgr_node: WRITE_ONCE(fe->debug_id_done, t_debug_id); } - BUG_ON(thread->return_error != BR_OK); + BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { - thread->return_error = BR_TRANSACTION_COMPLETE; + thread->return_error.cmd = BR_TRANSACTION_COMPLETE; + list_add_tail(&thread->return_error.work.entry, + &thread->todo); binder_send_failed_reply(in_reply_to, return_error); - } else - thread->return_error = return_error; + } else { + thread->return_error.cmd = return_error; + list_add_tail(&thread->return_error.work.entry, + &thread->todo); + } } static int binder_thread_write(struct binder_proc *proc, @@ -1904,7 +1908,7 @@ static int binder_thread_write(struct binder_proc *proc, void __user *ptr = buffer + *consumed; void __user *end = buffer + size; - while (ptr < end && thread->return_error == BR_OK) { + while (ptr < end && thread->return_error.cmd == BR_OK) { if (get_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); @@ -2184,7 +2188,12 @@ static int binder_thread_write(struct binder_proc *proc, } death = kzalloc(sizeof(*death), GFP_KERNEL); if (death == NULL) { - thread->return_error = BR_ERROR; + WARN_ON(thread->return_error.cmd != + BR_OK); + thread->return_error.cmd = BR_ERROR; + list_add_tail( + &thread->return_error.work.entry, + &thread->todo); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", proc->pid, thread->pid); @@ -2300,8 +2309,7 @@ static int binder_has_proc_work(struct binder_proc *proc, static int binder_has_thread_work(struct binder_thread *thread) { - return !list_empty(&thread->todo) || thread->return_error != BR_OK || - thread->looper_need_return; + return !list_empty(&thread->todo) || thread->looper_need_return; } static int binder_put_node_cmd(struct binder_proc *proc, @@ -2357,25 +2365,6 @@ retry: wait_for_proc_work = thread->transaction_stack == NULL 
&& list_empty(&thread->todo); - if (thread->return_error != BR_OK && ptr < end) { - if (thread->return_error2 != BR_OK) { - if (put_user(thread->return_error2, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, thread->return_error2); - if (ptr == end) - goto done; - thread->return_error2 = BR_OK; - } - if (put_user(thread->return_error, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, thread->return_error); - thread->return_error = BR_OK; - goto done; - } - - thread->looper |= BINDER_LOOPER_STATE_WAITING; if (wait_for_proc_work) proc->ready_threads++; @@ -2442,6 +2431,19 @@ retry: case BINDER_WORK_TRANSACTION: { t = container_of(w, struct binder_transaction, work); } break; + case BINDER_WORK_RETURN_ERROR: { + struct binder_error *e = container_of( + w, struct binder_error, work); + + WARN_ON(e->cmd == BR_OK); + if (put_user(e->cmd, (uint32_t __user *)ptr)) + return -EFAULT; + e->cmd = BR_OK; + ptr += sizeof(uint32_t); + + binder_stat_br(proc, thread, cmd); + list_del(&w->entry); + } break; case BINDER_WORK_TRANSACTION_COMPLETE: { cmd = BR_TRANSACTION_COMPLETE; if (put_user(cmd, (uint32_t __user *)ptr)) @@ -2686,6 +2688,14 @@ static void binder_release_work(struct list_head *list) binder_free_transaction(t); } } break; + case BINDER_WORK_RETURN_ERROR: { + struct binder_error *e = container_of( + w, struct binder_error, work); + + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered TRANSACTION_ERROR: %u\n", + e->cmd); + } break; case BINDER_WORK_TRANSACTION_COMPLETE: { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered TRANSACTION_COMPLETE\n"); @@ -2741,8 +2751,10 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) rb_link_node(&thread->rb_node, parent, p); rb_insert_color(&thread->rb_node, &proc->threads); thread->looper_need_return = true; - thread->return_error = BR_OK; - thread->return_error2 = BR_OK; + thread->return_error.work.type = BINDER_WORK_RETURN_ERROR; + thread->return_error.cmd = BR_OK; + thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; + thread->reply_error.cmd = BR_OK; } return thread; } @@ -2800,7 +2812,7 @@ static unsigned int binder_poll(struct file *filp, thread = binder_get_thread(proc); wait_for_proc_work = thread->transaction_stack == NULL && - list_empty(&thread->todo) && thread->return_error == BR_OK; + list_empty(&thread->todo); binder_unlock(__func__); @@ -3379,6 +3391,13 @@ static void print_binder_work(struct seq_file *m, const char *prefix, t = container_of(w, struct binder_transaction, work); print_binder_transaction(m, transaction_prefix, t); break; + case BINDER_WORK_RETURN_ERROR: { + struct binder_error *e = container_of( + w, struct binder_error, work); + + seq_printf(m, "%stransaction error: %u\n", + prefix, e->cmd); + } break; case BINDER_WORK_TRANSACTION_COMPLETE: seq_printf(m, "%stransaction complete\n", prefix); break; From f80cbc72e1b892d9df17c84050891a6bb53cbeac Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 26 May 2017 11:56:29 -0700 Subject: [PATCH 0919/3220] FROMLIST: binder: make sure target_node has strong ref (from https://patchwork.kernel.org/patch/9817787/) When initiating a transaction, the target_node must have a strong ref on it. Then we take a second strong ref to make sure the node survives until the transaction is complete. 
Change-Id: If7429cb43eda520ab89d45df6c19327cee97c60c Signed-off-by: Todd Kjos --- drivers/android/binder.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index f62919855936..0dec6d6c66ca 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1470,8 +1470,19 @@ static void binder_transaction(struct binder_proc *proc, if (tr->target.handle) { struct binder_ref *ref; + /* + * There must already be a strong ref + * on this node. If so, do a strong + * increment on the node to ensure it + * stays alive until the transaction is + * done. + */ ref = binder_get_ref(proc, tr->target.handle, true); - if (ref == NULL) { + if (ref) { + binder_inc_node(ref->node, 1, 0, NULL); + target_node = ref->node; + } + if (target_node == NULL) { binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; @@ -1479,7 +1490,6 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_invalid_target_handle; } - target_node = ref->node; } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; @@ -1489,6 +1499,7 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_no_context_mgr_node; } + binder_inc_node(target_node, 1, 0, NULL); mutex_unlock(&context->context_mgr_node_lock); } e->to_node = target_node->debug_id; @@ -1609,9 +1620,6 @@ static void binder_transaction(struct binder_proc *proc, t->buffer->transaction = t; t->buffer->target_node = target_node; trace_binder_transaction_alloc_buf(t->buffer); - if (target_node) - binder_inc_node(target_node, 1, 0, NULL); - off_start = (binder_size_t *)(t->buffer->data + ALIGN(tr->data_size, sizeof(void *))); offp = off_start; @@ -1847,6 +1855,7 @@ err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); + target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: @@ -1861,6 +1870,9 @@ err_empty_call_stack: err_dead_binder: err_invalid_target_handle: err_no_context_mgr_node: + if (target_node) + binder_dec_node(target_node, 1, 0); + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", proc->pid, thread->pid, return_error, return_error_param, From e482ec39d635cd1bcf764f2aefbc4ab3149d2299 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 12 May 2017 14:42:55 -0700 Subject: [PATCH 0920/3220] FROMLIST: binder: make sure accesses to proc/thread are safe (from https://patchwork.kernel.org/patch/9817787/) binder_thread and binder_proc may be accessed by other threads when processing a transaction. Therefore they must be prevented from being freed while a transaction is in progress that references them. This is done by introducing a temporary reference counter for threads and procs that indicates that the object is in use and must not be freed. binder_thread_dec_tmpref() and binder_proc_dec_tmpref() are used to decrement the temporary reference. It is safe to free a binder_thread if there is no reference and it has been released (indicated by thread->is_dead). It is safe to free a binder_proc if it has no remaining threads and no reference. 
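As a minimal sketch of the intended calling pattern (illustrative only, not part of the diff that follows; the dereference in the middle stands in for whatever work the caller does), the sender thread of a transaction is pinned with a temporary reference and released when done:

	struct binder_thread *from;

	from = binder_get_txn_from(t);	/* takes t->lock, bumps from->tmp_ref */
	if (from) {
		/* 'from' may be dereferenced safely here, e.g. from->proc->pid */
		binder_thread_dec_tmpref(from);	/* frees thread if dead and unreferenced */
	}
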
A spinlock is added to the binder_transaction to safely access and set references for t->from and for debug code to safely access t->to_thread and t->to_proc. Change-Id: I0a00a0294c3e93aea8b3f141c6f18e77ad244078 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 233 ++++++++++++++++++++++++++++++++++----- 1 file changed, 206 insertions(+), 27 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 0dec6d6c66ca..a16375e133cc 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -325,6 +325,7 @@ struct binder_proc { struct files_struct *files; struct hlist_node deferred_work_node; int deferred_work; + bool is_dead; struct list_head todo; wait_queue_head_t wait; @@ -334,6 +335,7 @@ struct binder_proc { int requested_threads; int requested_threads_started; int ready_threads; + int tmp_ref; long default_priority; struct dentry *debugfs_entry; struct binder_alloc alloc; @@ -360,6 +362,8 @@ struct binder_thread { struct binder_error reply_error; wait_queue_head_t wait; struct binder_stats stats; + atomic_t tmp_ref; + bool is_dead; }; struct binder_transaction { @@ -379,10 +383,19 @@ struct binder_transaction { long priority; long saved_priority; kuid_t sender_euid; + /** + * @lock: protects @from, @to_proc, and @to_thread + * + * @from, @to_proc, and @to_thread can be set to NULL + * during thread teardown + */ + spinlock_t lock; }; static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); +static void binder_free_thread(struct binder_thread *thread); +static void binder_free_proc(struct binder_proc *proc); static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) { @@ -781,6 +794,79 @@ static void binder_pop_transaction(struct binder_thread *target_thread, t->from = NULL; } +/** + * binder_thread_dec_tmpref() - decrement thread->tmp_ref + * @thread: thread to decrement + * + * A thread needs to be kept alive while being used to create or + * handle a transaction. binder_get_txn_from() is used to safely + * extract t->from from a binder_transaction and keep the thread + * indicated by t->from from being freed. When done with that + * binder_thread, this function is called to decrement the + * tmp_ref and free if appropriate (thread has been released + * and no transaction being processed by the driver) + */ +static void binder_thread_dec_tmpref(struct binder_thread *thread) +{ + /* + * atomic is used to protect the counter value while + * it cannot reach zero or thread->is_dead is false + * + * TODO: future patch adds locking to ensure that the + * check of tmp_ref and is_dead is done with a lock held + */ + atomic_dec(&thread->tmp_ref); + if (thread->is_dead && !atomic_read(&thread->tmp_ref)) { + binder_free_thread(thread); + return; + } +} + +/** + * binder_proc_dec_tmpref() - decrement proc->tmp_ref + * @proc: proc to decrement + * + * A binder_proc needs to be kept alive while being used to create or + * handle a transaction. proc->tmp_ref is incremented when + * creating a new transaction or the binder_proc is currently in-use + * by threads that are being released. When done with the binder_proc, + * this function is called to decrement the counter and free the + * proc if appropriate (proc has been released, all threads have + * been released and not currently in-use to process a transaction). 
+ */ +static void binder_proc_dec_tmpref(struct binder_proc *proc) +{ + proc->tmp_ref--; + if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && + !proc->tmp_ref) { + binder_free_proc(proc); + return; + } +} + +/** + * binder_get_txn_from() - safely extract the "from" thread in transaction + * @t: binder transaction for t->from + * + * Atomically return the "from" thread and increment the tmp_ref + * count for the thread to ensure it stays alive until + * binder_thread_dec_tmpref() is called. + * + * Return: the value of t->from + */ +static struct binder_thread *binder_get_txn_from( + struct binder_transaction *t) +{ + struct binder_thread *from; + + spin_lock(&t->lock); + from = t->from; + if (from) + atomic_inc(&from->tmp_ref); + spin_unlock(&t->lock); + return from; +} + static void binder_free_transaction(struct binder_transaction *t) { if (t->buffer) @@ -797,7 +883,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, BUG_ON(t->flags & TF_ONE_WAY); while (1) { - target_thread = t->from; + target_thread = binder_get_txn_from(t); if (target_thread) { binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d to %d:%d\n", @@ -816,6 +902,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, WARN(1, "Unexpected reply error: %u\n", target_thread->reply_error.cmd); } + binder_thread_dec_tmpref(target_thread); binder_free_transaction(t); return; } @@ -1396,7 +1483,7 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t *offp, *off_end, *off_start; binder_size_t off_min; u8 *sg_bufp, *sg_buf_end; - struct binder_proc *target_proc; + struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; struct list_head *target_list; @@ -1433,12 +1520,14 @@ static void binder_transaction(struct binder_proc *proc, } binder_set_nice(in_reply_to->saved_priority); if (in_reply_to->to_thread != thread) { + spin_lock(&in_reply_to->lock); binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, in_reply_to->debug_id, in_reply_to->to_proc ? in_reply_to->to_proc->pid : 0, in_reply_to->to_thread ? in_reply_to->to_thread->pid : 0); + spin_unlock(&in_reply_to->lock); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; @@ -1446,7 +1535,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_call_stack; } thread->transaction_stack = in_reply_to->to_parent; - target_thread = in_reply_to->from; + target_thread = binder_get_txn_from(in_reply_to); if (target_thread == NULL) { return_error = BR_DEAD_REPLY; return_error_line = __LINE__; @@ -1466,6 +1555,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_dead_binder; } target_proc = target_thread->proc; + target_proc->tmp_ref++; } else { if (tr->target.handle) { struct binder_ref *ref; @@ -1509,6 +1599,7 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_dead_binder; } + target_proc->tmp_ref++; if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -1521,19 +1612,30 @@ static void binder_transaction(struct binder_proc *proc, tmp = thread->transaction_stack; if (tmp->to_thread != thread) { + spin_lock(&tmp->lock); binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, tmp->debug_id, tmp->to_proc ? 
tmp->to_proc->pid : 0, tmp->to_thread ? tmp->to_thread->pid : 0); + spin_unlock(&tmp->lock); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_call_stack; } while (tmp) { - if (tmp->from && tmp->from->proc == target_proc) - target_thread = tmp->from; + struct binder_thread *from; + + spin_lock(&tmp->lock); + from = tmp->from; + if (from && from->proc == target_proc) { + atomic_inc(&from->tmp_ref); + target_thread = from; + spin_unlock(&tmp->lock); + break; + } + spin_unlock(&tmp->lock); tmp = tmp->from_parent; } } @@ -1557,6 +1659,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_alloc_t_failed; } binder_stats_created(BINDER_STAT_TRANSACTION); + spin_lock_init(&t->lock); tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { @@ -1815,6 +1918,8 @@ static void binder_transaction(struct binder_proc *proc, list_add_tail(&tcomplete->entry, &thread->todo); if (reply) { + if (target_thread->is_dead) + goto err_dead_proc_or_thread; BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction(target_thread, in_reply_to); binder_free_transaction(in_reply_to); @@ -1823,6 +1928,11 @@ static void binder_transaction(struct binder_proc *proc, t->need_reply = 1; t->from_parent = thread->transaction_stack; thread->transaction_stack = t; + if (target_proc->is_dead || + (target_thread && target_thread->is_dead)) { + binder_pop_transaction(thread, t); + goto err_dead_proc_or_thread; + } } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); @@ -1831,6 +1941,9 @@ static void binder_transaction(struct binder_proc *proc, target_wait = NULL; } else target_node->has_async_transaction = 1; + if (target_proc->is_dead || + (target_thread && target_thread->is_dead)) + goto err_dead_proc_or_thread; } t->work.type = BINDER_WORK_TRANSACTION; list_add_tail(&t->work.entry, target_list); @@ -1840,6 +1953,9 @@ static void binder_transaction(struct binder_proc *proc, else wake_up_interruptible(target_wait); } + if (target_thread) + binder_thread_dec_tmpref(target_thread); + binder_proc_dec_tmpref(target_proc); /* * write barrier to synchronize with initialization * of log entry @@ -1848,6 +1964,9 @@ static void binder_transaction(struct binder_proc *proc, WRITE_ONCE(e->debug_id_done, t_debug_id); return; +err_dead_proc_or_thread: + return_error = BR_DEAD_REPLY; + return_error_line = __LINE__; err_translate_failed: err_bad_object_type: err_bad_offset: @@ -1870,6 +1989,10 @@ err_empty_call_stack: err_dead_binder: err_invalid_target_handle: err_no_context_mgr_node: + if (target_thread) + binder_thread_dec_tmpref(target_thread); + if (target_proc) + binder_proc_dec_tmpref(target_proc); if (target_node) binder_dec_node(target_node, 1, 0); @@ -2422,6 +2545,7 @@ retry: struct binder_transaction_data tr; struct binder_work *w; struct binder_transaction *t = NULL; + struct binder_thread *t_from; if (!list_empty(&thread->todo)) { w = list_first_entry(&thread->todo, struct binder_work, @@ -2610,8 +2734,9 @@ retry: tr.flags = t->flags; tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid); - if (t->from) { - struct task_struct *sender = t->from->proc->tsk; + t_from = binder_get_txn_from(t); + if (t_from) { + struct task_struct *sender = t_from->proc->tsk; tr.sender_pid = task_tgid_nr_ns(sender, task_active_pid_ns(current)); @@ -2628,11 +2753,17 @@ retry: ALIGN(t->buffer->data_size, sizeof(void *)); - if (put_user(cmd, (uint32_t __user *)ptr)) + if (put_user(cmd, (uint32_t __user *)ptr)) { + if (t_from) 
+ binder_thread_dec_tmpref(t_from); return -EFAULT; + } ptr += sizeof(uint32_t); - if (copy_to_user(ptr, &tr, sizeof(tr))) + if (copy_to_user(ptr, &tr, sizeof(tr))) { + if (t_from) + binder_thread_dec_tmpref(t_from); return -EFAULT; + } ptr += sizeof(tr); trace_binder_transaction_received(t); @@ -2642,11 +2773,13 @@ retry: proc->pid, thread->pid, (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : "BR_REPLY", - t->debug_id, t->from ? t->from->proc->pid : 0, - t->from ? t->from->pid : 0, cmd, + t->debug_id, t_from ? t_from->proc->pid : 0, + t_from ? t_from->pid : 0, cmd, t->buffer->data_size, t->buffer->offsets_size, (u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets); + if (t_from) + binder_thread_dec_tmpref(t_from); list_del(&t->work.entry); t->buffer->allow_user_free = 1; if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { @@ -2758,6 +2891,7 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) binder_stats_created(BINDER_STAT_THREAD); thread->proc = proc; thread->pid = current->pid; + atomic_set(&thread->tmp_ref, 0); init_waitqueue_head(&thread->wait); INIT_LIST_HEAD(&thread->todo); rb_link_node(&thread->rb_node, parent, p); @@ -2771,18 +2905,55 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) return thread; } -static int binder_free_thread(struct binder_proc *proc, - struct binder_thread *thread) +static void binder_free_proc(struct binder_proc *proc) +{ + BUG_ON(!list_empty(&proc->todo)); + BUG_ON(!list_empty(&proc->delivered_death)); + binder_alloc_deferred_release(&proc->alloc); + put_task_struct(proc->tsk); + binder_stats_deleted(BINDER_STAT_PROC); + kfree(proc); +} + +static void binder_free_thread(struct binder_thread *thread) +{ + BUG_ON(!list_empty(&thread->todo)); + binder_stats_deleted(BINDER_STAT_THREAD); + binder_proc_dec_tmpref(thread->proc); + kfree(thread); +} + +static int binder_thread_release(struct binder_proc *proc, + struct binder_thread *thread) { struct binder_transaction *t; struct binder_transaction *send_reply = NULL; int active_transactions = 0; + struct binder_transaction *last_t = NULL; + /* + * take a ref on the proc so it survives + * after we remove this thread from proc->threads. 
+ * The corresponding dec is when we actually + * free the thread in binder_free_thread() + */ + proc->tmp_ref++; + /* + * take a ref on this thread to ensure it + * survives while we are releasing it + */ + atomic_inc(&thread->tmp_ref); rb_erase(&thread->rb_node, &proc->threads); t = thread->transaction_stack; - if (t && t->to_thread == thread) - send_reply = t; + if (t) { + spin_lock(&t->lock); + if (t->to_thread == thread) + send_reply = t; + } + thread->is_dead = true; + while (t) { + last_t = t; active_transactions++; binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "release %d:%d transaction %d %s, still active\n", @@ -2803,12 +2974,15 @@ static int binder_free_thread(struct binder_proc *proc, t = t->from_parent; } else BUG(); + spin_unlock(&last_t->lock); + if (t) + spin_lock(&t->lock); } + if (send_reply) binder_send_failed_reply(send_reply, BR_DEAD_REPLY); binder_release_work(&thread->todo); - kfree(thread); - binder_stats_deleted(BINDER_STAT_THREAD); + binder_thread_dec_tmpref(thread); return active_transactions; } @@ -2996,7 +3170,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case BINDER_THREAD_EXIT: binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", proc->pid, thread->pid); - binder_free_thread(proc, thread); + binder_thread_release(proc, thread); thread = NULL; break; case BINDER_VERSION: { @@ -3266,7 +3440,13 @@ static void binder_deferred_release(struct binder_proc *proc) context->binder_context_mgr_node = NULL; } mutex_unlock(&context->context_mgr_node_lock); + /* + * Make sure proc stays alive after we + * remove all the threads + */ + proc->tmp_ref++; + proc->is_dead = true; threads = 0; active_transactions = 0; while ((n = rb_first(&proc->threads))) { @@ -3274,7 +3454,7 @@ static void binder_deferred_release(struct binder_proc *proc) thread = rb_entry(n, struct binder_thread, rb_node); threads++; - active_transactions += binder_free_thread(proc, thread); + active_transactions += binder_thread_release(proc, thread); } nodes = 0; @@ -3300,17 +3480,12 @@ static void binder_deferred_release(struct binder_proc *proc) binder_release_work(&proc->todo); binder_release_work(&proc->delivered_death); - binder_alloc_deferred_release(&proc->alloc); - binder_stats_deleted(BINDER_STAT_PROC); - - put_task_struct(proc->tsk); - binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n", __func__, proc->pid, threads, nodes, incoming_refs, outgoing_refs, active_transactions); - kfree(proc); + binder_proc_dec_tmpref(proc); } static void binder_deferred_func(struct work_struct *work) @@ -3371,6 +3546,7 @@ binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) static void print_binder_transaction(struct seq_file *m, const char *prefix, struct binder_transaction *t) { + spin_lock(&t->lock); seq_printf(m, "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d", prefix, t->debug_id, t, @@ -3379,6 +3555,8 @@ static void print_binder_transaction(struct seq_file *m, const char *prefix, t->to_proc ? t->to_proc->pid : 0, t->to_thread ? 
t->to_thread->pid : 0, t->code, t->flags, t->priority, t->need_reply); + spin_unlock(&t->lock); + if (t->buffer == NULL) { seq_puts(m, " buffer free\n"); return; @@ -3443,9 +3621,10 @@ static void print_binder_thread(struct seq_file *m, size_t start_pos = m->count; size_t header_pos; - seq_printf(m, " thread %d: l %02x need_return %d\n", + seq_printf(m, " thread %d: l %02x need_return %d tr %d\n", thread->pid, thread->looper, - thread->looper_need_return); + thread->looper_need_return, + atomic_read(&thread->tmp_ref)); header_pos = m->count; t = thread->transaction_stack; while (t) { From f7d874123e4de442dcd6aca8afe0fd0b9d2854b4 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 8 May 2017 09:16:27 -0700 Subject: [PATCH 0921/3220] FROMLIST: binder: refactor binder ref inc/dec for thread safety (from https://patchwork.kernel.org/patch/9817781/) Once locks are added, binder_ref's will only be accessed safely with the proc lock held. Refactor the inc/dec paths to make them atomic with the binder_get_ref* paths and node inc/dec. For example, instead of: ref = binder_get_ref(proc, handle, strong); ... binder_dec_ref(ref, strong); we now have: ret = binder_dec_ref_for_handle(proc, handle, strong, &rdata); Since the actual ref is no longer exposed to callers, a new struct binder_ref_data is introduced which can be used to return a copy of ref state. Change-Id: I7de22107f8ebc967cee63251d584fceb4ea56250 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 484 ++++++++++++++++++++++++--------- drivers/android/binder_trace.h | 32 ++- 2 files changed, 379 insertions(+), 137 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a16375e133cc..b7109a356c46 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -291,20 +291,51 @@ struct binder_ref_death { binder_uintptr_t cookie; }; +/** + * struct binder_ref_data - binder_ref counts and id + * @debug_id: unique ID for the ref + * @desc: unique userspace handle for ref + * @strong: strong ref count (debugging only if not locked) + * @weak: weak ref count (debugging only if not locked) + * + * Structure to hold ref count and ref id information. Since + * the actual ref can only be accessed with a lock, this structure + * is used to return information about the ref to callers of + * ref inc/dec functions. + */ +struct binder_ref_data { + int debug_id; + uint32_t desc; + int strong; + int weak; +}; + +/** + * struct binder_ref - struct to track references on nodes + * @data: binder_ref_data containing id, handle, and current refcounts + * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree + * @rb_node_node: node for lookup by @node in proc's rb_tree + * @node_entry: list entry for node->refs list in target node + * @proc: binder_proc containing ref + * @node: binder_node of target node. When cleaning up a + * ref for deletion in binder_cleanup_ref, a non-NULL + * @node indicates the node must be freed + * @death: pointer to death notification (ref_death) if requested + * + * Structure to track references from procA to target node (on procB). This + * structure is unsafe to access without holding @proc->outer_lock. 
+ */ struct binder_ref { /* Lookups needed: */ /* node + proc => ref (transaction) */ /* desc + proc => ref (transaction, inc/dec ref) */ /* node => refs + procs (proc exit) */ - int debug_id; + struct binder_ref_data data; struct rb_node rb_node_desc; struct rb_node rb_node_node; struct hlist_node node_entry; struct binder_proc *proc; struct binder_node *node; - uint32_t desc; - int strong; - int weak; struct binder_ref_death *death; }; @@ -628,11 +659,11 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, while (n) { ref = rb_entry(n, struct binder_ref, rb_node_desc); - if (desc < ref->desc) { + if (desc < ref->data.desc) { n = n->rb_left; - } else if (desc > ref->desc) { + } else if (desc > ref->data.desc) { n = n->rb_right; - } else if (need_strong_ref && !ref->strong) { + } else if (need_strong_ref && !ref->data.strong) { binder_user_error("tried to use weak ref as strong ref\n"); return NULL; } else { @@ -642,14 +673,33 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, return NULL; } +/** + * binder_get_ref_for_node() - get the ref associated with given node + * @proc: binder_proc that owns the ref + * @node: binder_node of target + * @new_ref: newly allocated binder_ref to be initialized or %NULL + * + * Look up the ref for the given node and return it if it exists + * + * If it doesn't exist and the caller provides a newly allocated + * ref, initialize the fields of the newly allocated ref and insert + * into the given proc rb_trees and node refs list. + * + * Return: the ref for node. It is possible that another thread + * allocated/initialized the ref first in which case the + * returned ref would be different than the passed-in + * new_ref. new_ref must be kfree'd by the caller in + * this case. + */ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, - struct binder_node *node) + struct binder_node *node, + struct binder_ref *new_ref) { - struct rb_node *n; + struct binder_context *context = proc->context; struct rb_node **p = &proc->refs_by_node.rb_node; struct rb_node *parent = NULL; - struct binder_ref *ref, *new_ref; - struct binder_context *context = proc->context; + struct binder_ref *ref; + struct rb_node *n; while (*p) { parent = *p; @@ -662,22 +712,22 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, else return ref; } - new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (new_ref == NULL) + if (!new_ref) return NULL; + binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = atomic_inc_return(&binder_last_id); + new_ref->data.debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); - new_ref->desc = (node == context->binder_context_mgr_node) ? 0 : 1; + new_ref->data.desc = (node == context->binder_context_mgr_node) ? 
0 : 1; for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); - if (ref->desc > new_ref->desc) + if (ref->data.desc > new_ref->data.desc) break; - new_ref->desc = ref->desc + 1; + new_ref->data.desc = ref->data.desc + 1; } p = &proc->refs_by_desc.rb_node; @@ -685,9 +735,9 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, parent = *p; ref = rb_entry(parent, struct binder_ref, rb_node_desc); - if (new_ref->desc < ref->desc) + if (new_ref->data.desc < ref->data.desc) p = &(*p)->rb_left; - else if (new_ref->desc > ref->desc) + else if (new_ref->data.desc > ref->data.desc) p = &(*p)->rb_right; else BUG(); @@ -698,89 +748,267 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d new ref %d desc %d for node %d\n", - proc->pid, new_ref->debug_id, new_ref->desc, + proc->pid, new_ref->data.debug_id, new_ref->data.desc, node->debug_id); return new_ref; } -static void binder_delete_ref(struct binder_ref *ref) +static void binder_cleanup_ref(struct binder_ref *ref) { binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", - ref->proc->pid, ref->debug_id, ref->desc, + ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->node->debug_id); rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); - if (ref->strong) + + if (ref->data.strong) binder_dec_node(ref->node, 1, 1); + hlist_del(&ref->node_entry); binder_dec_node(ref->node, 0, 1); + if (ref->death) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%d delete ref %d desc %d has death notification\n", - ref->proc->pid, ref->debug_id, ref->desc); + ref->proc->pid, ref->data.debug_id, + ref->data.desc); list_del(&ref->death->work.entry); - kfree(ref->death); binder_stats_deleted(BINDER_STAT_DEATH); } - kfree(ref); binder_stats_deleted(BINDER_STAT_REF); } +/** + * binder_inc_ref() - increment the ref for given handle + * @ref: ref to be incremented + * @strong: if true, strong increment, else weak + * @target_list: list to queue node work on + * + * Increment the ref. + * + * Return: 0, if successful, else errno + */ static int binder_inc_ref(struct binder_ref *ref, int strong, struct list_head *target_list) { int ret; if (strong) { - if (ref->strong == 0) { + if (ref->data.strong == 0) { ret = binder_inc_node(ref->node, 1, 1, target_list); if (ret) return ret; } - ref->strong++; + ref->data.strong++; } else { - if (ref->weak == 0) { + if (ref->data.weak == 0) { ret = binder_inc_node(ref->node, 0, 1, target_list); if (ret) return ret; } - ref->weak++; + ref->data.weak++; } return 0; } - -static int binder_dec_ref(struct binder_ref *ref, int strong) +/** + * binder_dec_ref() - dec the ref for given handle + * @ref: ref to be decremented + * @strong: if true, strong decrement, else weak + * + * Decrement the ref. + * + * TODO: kfree is avoided here since an upcoming patch + * will put this under a lock. 
+ * + * Return: true if ref is cleaned up and ready to be freed + */ +static bool binder_dec_ref(struct binder_ref *ref, int strong) { if (strong) { - if (ref->strong == 0) { + if (ref->data.strong == 0) { binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n", - ref->proc->pid, ref->debug_id, - ref->desc, ref->strong, ref->weak); - return -EINVAL; + ref->proc->pid, ref->data.debug_id, + ref->data.desc, ref->data.strong, + ref->data.weak); + return false; } - ref->strong--; - if (ref->strong == 0) { + ref->data.strong--; + if (ref->data.strong == 0) { int ret; ret = binder_dec_node(ref->node, strong, 1); if (ret) - return ret; + return false; } } else { - if (ref->weak == 0) { + if (ref->data.weak == 0) { binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", - ref->proc->pid, ref->debug_id, - ref->desc, ref->strong, ref->weak); - return -EINVAL; + ref->proc->pid, ref->data.debug_id, + ref->data.desc, ref->data.strong, + ref->data.weak); + return false; } - ref->weak--; + ref->data.weak--; } - if (ref->strong == 0 && ref->weak == 0) - binder_delete_ref(ref); - return 0; + if (ref->data.strong == 0 && ref->data.weak == 0) { + binder_cleanup_ref(ref); + /* + * TODO: we could kfree(ref) here, but an upcoming + * patch will call this with a lock held, so we + * return an indication that the ref should be + * freed. + */ + return true; + } + return false; +} + +/** + * binder_get_node_from_ref() - get the node from the given proc/desc + * @proc: proc containing the ref + * @desc: the handle associated with the ref + * @need_strong_ref: if true, only return node if ref is strong + * @rdata: the id/refcount data for the ref + * + * Given a proc and ref handle, return the associated binder_node + * + * Return: a binder_node or NULL if not found or not strong when strong required + */ +static struct binder_node *binder_get_node_from_ref( + struct binder_proc *proc, + u32 desc, bool need_strong_ref, + struct binder_ref_data *rdata) +{ + struct binder_node *node; + struct binder_ref *ref; + + ref = binder_get_ref(proc, desc, need_strong_ref); + if (!ref) + goto err_no_ref; + node = ref->node; + if (rdata) + *rdata = ref->data; + + return node; + +err_no_ref: + return NULL; +} + +/** + * binder_free_ref() - free the binder_ref + * @ref: ref to free + * + * Free the binder_ref and the binder_ref_death indicated by ref->death. + */ +static void binder_free_ref(struct binder_ref *ref) +{ + kfree(ref->death); + kfree(ref); +} + +/** + * binder_update_ref_for_handle() - inc/dec the ref for given handle + * @proc: proc containing the ref + * @desc: the handle associated with the ref + * @increment: true=inc reference, false=dec reference + * @strong: true=strong reference, false=weak reference + * @rdata: the id/refcount data for the ref + * + * Given a proc and ref handle, increment or decrement the ref + * according to "increment" arg. 
+ * + * Return: 0 if successful, else errno + */ +static int binder_update_ref_for_handle(struct binder_proc *proc, + uint32_t desc, bool increment, bool strong, + struct binder_ref_data *rdata) +{ + int ret = 0; + struct binder_ref *ref; + bool delete_ref = false; + + ref = binder_get_ref(proc, desc, strong); + if (!ref) { + ret = -EINVAL; + goto err_no_ref; + } + if (increment) + ret = binder_inc_ref(ref, strong, NULL); + else + delete_ref = binder_dec_ref(ref, strong); + + if (rdata) + *rdata = ref->data; + + if (delete_ref) + binder_free_ref(ref); + return ret; + +err_no_ref: + return ret; +} + +/** + * binder_dec_ref_for_handle() - dec the ref for given handle + * @proc: proc containing the ref + * @desc: the handle associated with the ref + * @strong: true=strong reference, false=weak reference + * @rdata: the id/refcount data for the ref + * + * Just calls binder_update_ref_for_handle() to decrement the ref. + * + * Return: 0 if successful, else errno + */ +static int binder_dec_ref_for_handle(struct binder_proc *proc, + uint32_t desc, bool strong, struct binder_ref_data *rdata) +{ + return binder_update_ref_for_handle(proc, desc, false, strong, rdata); +} + + +/** + * binder_inc_ref_for_node() - increment the ref for given proc/node + * @proc: proc containing the ref + * @node: target node + * @strong: true=strong reference, false=weak reference + * @target_list: worklist to use if node is incremented + * @rdata: the id/refcount data for the ref + * + * Given a proc and node, increment the ref. Create the ref if it + * doesn't already exist + * + * Return: 0 if successful, else errno + */ +static int binder_inc_ref_for_node(struct binder_proc *proc, + struct binder_node *node, + bool strong, + struct list_head *target_list, + struct binder_ref_data *rdata) +{ + struct binder_ref *ref; + struct binder_ref *new_ref = NULL; + int ret = 0; + + ref = binder_get_ref_for_node(proc, node, NULL); + if (!ref) { + new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!new_ref) + return -ENOMEM; + ref = binder_get_ref_for_node(proc, node, new_ref); + } + ret = binder_inc_ref(ref, strong, target_list); + *rdata = ref->data; + if (new_ref && ref != new_ref) + /* + * Another thread created the ref first so + * free the one we allocated + */ + kfree(new_ref); + return ret; } static void binder_pop_transaction(struct binder_thread *target_thread, @@ -1125,20 +1353,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; - struct binder_ref *ref; + struct binder_ref_data rdata; + int ret; fp = to_flat_binder_object(hdr); - ref = binder_get_ref(proc, fp->handle, - hdr->type == BINDER_TYPE_HANDLE); - if (ref == NULL) { - pr_err("transaction release %d bad handle %d\n", - debug_id, fp->handle); + ret = binder_dec_ref_for_handle(proc, fp->handle, + hdr->type == BINDER_TYPE_HANDLE, &rdata); + + if (ret) { + pr_err("transaction release %d bad handle %d, ret = %d\n", + debug_id, fp->handle, ret); break; } binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, ref->node->debug_id); - binder_dec_ref(ref, hdr->type == BINDER_TYPE_HANDLE); + " ref %d desc %d\n", + rdata.debug_id, rdata.desc); } break; case BINDER_TYPE_FD: { @@ -1210,9 +1439,10 @@ static int binder_translate_binder(struct flat_binder_object *fp, struct binder_thread *thread) { struct binder_node *node; - struct binder_ref *ref; struct binder_proc *proc = thread->proc; struct binder_proc 
*target_proc = t->to_proc; + struct binder_ref_data rdata; + int ret; node = binder_get_node(proc, fp->binder); if (!node) { @@ -1233,25 +1463,25 @@ static int binder_translate_binder(struct flat_binder_object *fp, if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) return -EPERM; - ref = binder_get_ref_for_node(target_proc, node); - if (!ref) - return -ENOMEM; + ret = binder_inc_ref_for_node(target_proc, node, + fp->hdr.type == BINDER_TYPE_BINDER, + &thread->todo, &rdata); + if (ret) + return ret; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; else fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; - fp->handle = ref->desc; + fp->handle = rdata.desc; fp->cookie = 0; - binder_inc_ref(ref, fp->hdr.type == BINDER_TYPE_HANDLE, &thread->todo); - trace_binder_transaction_node_to_ref(t, node, ref); + trace_binder_transaction_node_to_ref(t, node, &rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx -> ref %d desc %d\n", node->debug_id, (u64)node->ptr, - ref->debug_id, ref->desc); - + rdata.debug_id, rdata.desc); return 0; } @@ -1259,13 +1489,14 @@ static int binder_translate_handle(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { - struct binder_ref *ref; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; + struct binder_node *node; + struct binder_ref_data src_rdata; - ref = binder_get_ref(proc, fp->handle, - fp->hdr.type == BINDER_TYPE_HANDLE); - if (!ref) { + node = binder_get_node_from_ref(proc, fp->handle, + fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata); + if (!node) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, thread->pid, fp->handle); return -EINVAL; @@ -1273,37 +1504,41 @@ static int binder_translate_handle(struct flat_binder_object *fp, if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) return -EPERM; - if (ref->node->proc == target_proc) { + if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) fp->hdr.type = BINDER_TYPE_BINDER; else fp->hdr.type = BINDER_TYPE_WEAK_BINDER; - fp->binder = ref->node->ptr; - fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, fp->hdr.type == BINDER_TYPE_BINDER, + fp->binder = node->ptr; + fp->cookie = node->cookie; + binder_inc_node(node, + fp->hdr.type == BINDER_TYPE_BINDER, 0, NULL); - trace_binder_transaction_ref_to_node(t, ref); + trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", - ref->debug_id, ref->desc, ref->node->debug_id, - (u64)ref->node->ptr); + src_rdata.debug_id, src_rdata.desc, node->debug_id, + (u64)node->ptr); } else { - struct binder_ref *new_ref; + int ret; + struct binder_ref_data dest_rdata; - new_ref = binder_get_ref_for_node(target_proc, ref->node); - if (!new_ref) - return -ENOMEM; + ret = binder_inc_ref_for_node(target_proc, node, + fp->hdr.type == BINDER_TYPE_HANDLE, + NULL, &dest_rdata); + if (ret) + return ret; fp->binder = 0; - fp->handle = new_ref->desc; + fp->handle = dest_rdata.desc; fp->cookie = 0; - binder_inc_ref(new_ref, fp->hdr.type == BINDER_TYPE_HANDLE, - NULL); - trace_binder_transaction_ref_to_ref(t, ref, new_ref); + trace_binder_transaction_ref_to_ref(t, node, &src_rdata, + &dest_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, new_ref->debug_id, - new_ref->desc, ref->node->debug_id); + src_rdata.debug_id, src_rdata.desc, + 
dest_rdata.debug_id, dest_rdata.desc, + node->debug_id); } return 0; } @@ -2044,6 +2279,8 @@ static int binder_thread_write(struct binder_proc *proc, void __user *end = buffer + size; while (ptr < end && thread->return_error.cmd == BR_OK) { + int ret; + if (get_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); @@ -2059,62 +2296,61 @@ static int binder_thread_write(struct binder_proc *proc, case BC_RELEASE: case BC_DECREFS: { uint32_t target; - struct binder_ref *ref = NULL; const char *debug_string; + bool strong = cmd == BC_ACQUIRE || cmd == BC_RELEASE; + bool increment = cmd == BC_INCREFS || cmd == BC_ACQUIRE; + struct binder_ref_data rdata; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); - if (target == 0 && - (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { + ret = -1; + if (increment && !target) { struct binder_node *ctx_mgr_node; - mutex_lock(&context->context_mgr_node_lock); ctx_mgr_node = context->binder_context_mgr_node; - if (ctx_mgr_node) { - ref = binder_get_ref_for_node(proc, - ctx_mgr_node); - if (ref && ref->desc != target) { - binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", - proc->pid, thread->pid, - ref->desc); - } - } + if (ctx_mgr_node) + ret = binder_inc_ref_for_node( + proc, ctx_mgr_node, + strong, NULL, &rdata); mutex_unlock(&context->context_mgr_node_lock); } - if (ref == NULL) - ref = binder_get_ref(proc, target, - cmd == BC_ACQUIRE || - cmd == BC_RELEASE); - if (ref == NULL) { - binder_user_error("%d:%d refcount change on invalid ref %d\n", - proc->pid, thread->pid, target); - break; + if (ret) + ret = binder_update_ref_for_handle( + proc, target, increment, strong, + &rdata); + if (!ret && rdata.desc != target) { + binder_user_error("%d:%d tried to acquire reference to desc %d, got %d instead\n", + proc->pid, thread->pid, + target, rdata.desc); } switch (cmd) { case BC_INCREFS: debug_string = "IncRefs"; - binder_inc_ref(ref, 0, NULL); break; case BC_ACQUIRE: debug_string = "Acquire"; - binder_inc_ref(ref, 1, NULL); break; case BC_RELEASE: debug_string = "Release"; - binder_dec_ref(ref, 1); break; case BC_DECREFS: default: debug_string = "DecRefs"; - binder_dec_ref(ref, 0); + break; + } + if (ret) { + binder_user_error("%d:%d %s %d refcount change on invalid ref %d ret %d\n", + proc->pid, thread->pid, debug_string, + strong, target, ret); break; } binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s ref %d desc %d s %d w %d for node %d\n", - proc->pid, thread->pid, debug_string, ref->debug_id, - ref->desc, ref->strong, ref->weak, ref->node->debug_id); + "%d:%d %s ref %d desc %d s %d w %d\n", + proc->pid, thread->pid, debug_string, + rdata.debug_id, rdata.desc, rdata.strong, + rdata.weak); break; } case BC_INCREFS_DONE: @@ -2312,8 +2548,9 @@ static int binder_thread_write(struct binder_proc *proc, cmd == BC_REQUEST_DEATH_NOTIFICATION ? 
"BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", - (u64)cookie, ref->debug_id, ref->desc, - ref->strong, ref->weak, ref->node->debug_id); + (u64)cookie, ref->data.debug_id, + ref->data.desc, ref->data.strong, + ref->data.weak, ref->node->debug_id); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { if (ref->death) { @@ -3474,7 +3711,8 @@ static void binder_deferred_release(struct binder_proc *proc) ref = rb_entry(n, struct binder_ref, rb_node_desc); outgoing_refs++; - binder_delete_ref(ref); + binder_cleanup_ref(ref); + binder_free_ref(ref); } binder_release_work(&proc->todo); @@ -3676,9 +3914,11 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node) static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) { - seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %p\n", - ref->debug_id, ref->desc, ref->node->proc ? "" : "dead ", - ref->node->debug_id, ref->strong, ref->weak, ref->death); + seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", + ref->data.debug_id, ref->data.desc, + ref->node->proc ? "" : "dead ", + ref->node->debug_id, ref->data.strong, + ref->data.weak, ref->death); } static void print_binder_proc(struct seq_file *m, @@ -3845,8 +4085,8 @@ static void print_binder_proc_stats(struct seq_file *m, struct binder_ref *ref = rb_entry(n, struct binder_ref, rb_node_desc); count++; - strong += ref->strong; - weak += ref->weak; + strong += ref->data.strong; + weak += ref->data.weak; } seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 50b0d21f42cf..7967db16ba5a 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -24,7 +24,7 @@ struct binder_buffer; struct binder_node; struct binder_proc; struct binder_alloc; -struct binder_ref; +struct binder_ref_data; struct binder_thread; struct binder_transaction; @@ -147,8 +147,8 @@ TRACE_EVENT(binder_transaction_received, TRACE_EVENT(binder_transaction_node_to_ref, TP_PROTO(struct binder_transaction *t, struct binder_node *node, - struct binder_ref *ref), - TP_ARGS(t, node, ref), + struct binder_ref_data *rdata), + TP_ARGS(t, node, rdata), TP_STRUCT__entry( __field(int, debug_id) @@ -161,8 +161,8 @@ TRACE_EVENT(binder_transaction_node_to_ref, __entry->debug_id = t->debug_id; __entry->node_debug_id = node->debug_id; __entry->node_ptr = node->ptr; - __entry->ref_debug_id = ref->debug_id; - __entry->ref_desc = ref->desc; + __entry->ref_debug_id = rdata->debug_id; + __entry->ref_desc = rdata->desc; ), TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d", __entry->debug_id, __entry->node_debug_id, @@ -171,8 +171,9 @@ TRACE_EVENT(binder_transaction_node_to_ref, ); TRACE_EVENT(binder_transaction_ref_to_node, - TP_PROTO(struct binder_transaction *t, struct binder_ref *ref), - TP_ARGS(t, ref), + TP_PROTO(struct binder_transaction *t, struct binder_node *node, + struct binder_ref_data *rdata), + TP_ARGS(t, node, rdata), TP_STRUCT__entry( __field(int, debug_id) @@ -183,10 +184,10 @@ TRACE_EVENT(binder_transaction_ref_to_node, ), TP_fast_assign( __entry->debug_id = t->debug_id; - __entry->ref_debug_id = ref->debug_id; - __entry->ref_desc = ref->desc; - __entry->node_debug_id = ref->node->debug_id; - __entry->node_ptr = ref->node->ptr; + __entry->ref_debug_id = rdata->debug_id; + __entry->ref_desc = rdata->desc; + __entry->node_debug_id = node->debug_id; + __entry->node_ptr = node->ptr; ), TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> 
dest_ptr=0x%016llx", __entry->debug_id, __entry->node_debug_id, @@ -195,9 +196,10 @@ TRACE_EVENT(binder_transaction_ref_to_node, ); TRACE_EVENT(binder_transaction_ref_to_ref, - TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref, - struct binder_ref *dest_ref), - TP_ARGS(t, src_ref, dest_ref), + TP_PROTO(struct binder_transaction *t, struct binder_node *node, + struct binder_ref_data *src_ref, + struct binder_ref_data *dest_ref), + TP_ARGS(t, node, src_ref, dest_ref), TP_STRUCT__entry( __field(int, debug_id) @@ -209,7 +211,7 @@ TRACE_EVENT(binder_transaction_ref_to_ref, ), TP_fast_assign( __entry->debug_id = t->debug_id; - __entry->node_debug_id = src_ref->node->debug_id; + __entry->node_debug_id = node->debug_id; __entry->src_ref_debug_id = src_ref->debug_id; __entry->src_ref_desc = src_ref->desc; __entry->dest_ref_debug_id = dest_ref->debug_id; From 96dd75d9917402f0e3a8243c6152b2e5d662c37f Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 9 May 2017 11:08:05 -0700 Subject: [PATCH 0922/3220] FROMLIST: binder: use node->tmp_refs to ensure node safety (from https://patchwork.kernel.org/patch/9817795/) When obtaining a node via binder_get_node(), binder_get_node_from_ref() or binder_new_node(), increment node->tmp_refs to take a temporary reference on the node to ensure the node persists while being used. binder_put_node() must be called to remove the temporary reference. Change-Id: I962b39d5cd80b2d7e4786bb87236ede7914e2db7 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 124 ++++++++++++++++++++++++++++++++------- 1 file changed, 104 insertions(+), 20 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b7109a356c46..4c636da38b2b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -274,6 +274,7 @@ struct binder_node { int internal_strong_refs; int local_weak_refs; int local_strong_refs; + int tmp_refs; binder_uintptr_t ptr; binder_uintptr_t cookie; unsigned has_strong_ref:1; @@ -427,6 +428,7 @@ static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); +static void binder_inc_node_tmpref(struct binder_node *node); static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) { @@ -521,8 +523,15 @@ static struct binder_node *binder_get_node(struct binder_proc *proc, n = n->rb_left; else if (ptr > node->ptr) n = n->rb_right; - else + else { + /* + * take an implicit weak reference + * to ensure node stays alive until + * call to binder_put_node() + */ + binder_inc_node_tmpref(node); return node; + } } return NULL; } @@ -551,6 +560,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, if (node == NULL) return NULL; binder_stats_created(BINDER_STAT_NODE); + node->tmp_refs++; rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); node->debug_id = atomic_inc_return(&binder_last_id); @@ -616,7 +626,8 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) } else { if (!internal) node->local_weak_refs--; - if (node->local_weak_refs || !hlist_empty(&node->refs)) + if (node->local_weak_refs || node->tmp_refs || + !hlist_empty(&node->refs)) return 0; } if (node->proc && (node->has_strong_ref || node->has_weak_ref)) { @@ -626,7 +637,7 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && - 
!node->local_weak_refs) { + !node->local_weak_refs && !node->tmp_refs) { list_del_init(&node->work.entry); if (node->proc) { rb_erase(&node->rb_node, &node->proc->nodes); @@ -649,6 +660,46 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) return 0; } +/** + * binder_inc_node_tmpref() - take a temporary reference on node + * @node: node to reference + * + * Take reference on node to prevent the node from being freed + * while referenced only by a local variable + */ +static void binder_inc_node_tmpref(struct binder_node *node) +{ + /* + * No call to binder_inc_node() is needed since we + * don't need to inform userspace of any changes to + * tmp_refs + */ + node->tmp_refs++; +} + +/** + * binder_dec_node_tmpref() - remove a temporary reference on node + * @node: node to reference + * + * Release temporary reference on node taken via binder_inc_node_tmpref() + */ +static void binder_dec_node_tmpref(struct binder_node *node) +{ + node->tmp_refs--; + BUG_ON(node->tmp_refs < 0); + /* + * Call binder_dec_node() to check if all refcounts are 0 + * and cleanup is needed. Calling with strong=0 and internal=1 + * causes no actual reference to be released in binder_dec_node(). + * If that changes, a change is needed here too. + */ + binder_dec_node(node, 0, 1); +} + +static void binder_put_node(struct binder_node *node) +{ + binder_dec_node_tmpref(node); +} static struct binder_ref *binder_get_ref(struct binder_proc *proc, u32 desc, bool need_strong_ref) @@ -889,6 +940,11 @@ static struct binder_node *binder_get_node_from_ref( if (!ref) goto err_no_ref; node = ref->node; + /* + * Take an implicit reference on the node to ensure + * it stays alive until the call to binder_put_node() + */ + binder_inc_node_tmpref(node); if (rdata) *rdata = ref->data; @@ -1349,6 +1405,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, node->debug_id, (u64)node->ptr); binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, 0); + binder_put_node(node); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { @@ -1442,7 +1499,7 @@ static int binder_translate_binder(struct flat_binder_object *fp, struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_ref_data rdata; - int ret; + int ret = 0; node = binder_get_node(proc, fp->binder); if (!node) { @@ -1458,16 +1515,19 @@ static int binder_translate_binder(struct flat_binder_object *fp, proc->pid, thread->pid, (u64)fp->binder, node->debug_id, (u64)fp->cookie, (u64)node->cookie); - return -EINVAL; + ret = -EINVAL; + goto done; + } + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + ret = -EPERM; + goto done; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) - return -EPERM; ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_BINDER, &thread->todo, &rdata); if (ret) - return ret; + goto done; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; @@ -1482,7 +1542,9 @@ static int binder_translate_binder(struct flat_binder_object *fp, " node %d u%016llx -> ref %d desc %d\n", node->debug_id, (u64)node->ptr, rdata.debug_id, rdata.desc); - return 0; +done: + binder_put_node(node); + return ret; } static int binder_translate_handle(struct flat_binder_object *fp, @@ -1493,6 +1555,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, struct binder_proc *target_proc = t->to_proc; struct binder_node *node; struct binder_ref_data src_rdata; + int ret = 0; node = 
binder_get_node_from_ref(proc, fp->handle, fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata); @@ -1501,8 +1564,10 @@ static int binder_translate_handle(struct flat_binder_object *fp, proc->pid, thread->pid, fp->handle); return -EINVAL; } - if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) - return -EPERM; + if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) { + ret = -EPERM; + goto done; + } if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) @@ -1527,7 +1592,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, fp->hdr.type == BINDER_TYPE_HANDLE, NULL, &dest_rdata); if (ret) - return ret; + goto done; fp->binder = 0; fp->handle = dest_rdata.desc; @@ -1540,7 +1605,9 @@ static int binder_translate_handle(struct flat_binder_object *fp, dest_rdata.debug_id, dest_rdata.desc, node->debug_id); } - return 0; +done: + binder_put_node(node); + return ret; } static int binder_translate_fd(int fd, @@ -2382,6 +2449,7 @@ static int binder_thread_write(struct binder_proc *proc, "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", (u64)node_ptr, node->debug_id, (u64)cookie, (u64)node->cookie); + binder_put_node(node); break; } if (cmd == BC_ACQUIRE_DONE) { @@ -2389,6 +2457,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); + binder_put_node(node); break; } node->pending_strong_ref = 0; @@ -2397,16 +2466,19 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); + binder_put_node(node); break; } node->pending_weak_ref = 0; } binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0); binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s node %d ls %d lw %d\n", + "%d:%d %s node %d ls %d lw %d tr %d\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? 
"BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", - node->debug_id, node->local_strong_refs, node->local_weak_refs); + node->debug_id, node->local_strong_refs, + node->local_weak_refs, node->tmp_refs); + binder_put_node(node); break; } case BC_ATTEMPT_ACQUIRE: @@ -2846,7 +2918,8 @@ retry: strong = node->internal_strong_refs || node->local_strong_refs; weak = !hlist_empty(&node->refs) || - node->local_weak_refs || strong; + node->local_weak_refs || + node->tmp_refs || strong; has_strong_ref = node->has_strong_ref; has_weak_ref = node->has_weak_ref; @@ -3358,6 +3431,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) new_node->has_strong_ref = 1; new_node->has_weak_ref = 1; context->binder_context_mgr_node = new_node; + binder_put_node(new_node); out: mutex_unlock(&context->context_mgr_node_lock); return ret; @@ -3616,8 +3690,11 @@ static int binder_node_release(struct binder_node *node, int refs) list_del_init(&node->work.entry); binder_release_work(&node->async_todo); - - if (hlist_empty(&node->refs)) { + /* + * The caller must have taken a temporary ref on the node, + */ + BUG_ON(!node->tmp_refs); + if (hlist_empty(&node->refs) && node->tmp_refs == 1) { kfree(node); binder_stats_deleted(BINDER_STAT_NODE); @@ -3652,6 +3729,7 @@ static int binder_node_release(struct binder_node *node, int refs) binder_debug(BINDER_DEBUG_DEAD_BINDER, "node %d now dead, refs %d, death %d\n", node->debug_id, refs, death); + binder_put_node(node); return refs; } @@ -3701,6 +3779,12 @@ static void binder_deferred_release(struct binder_proc *proc) node = rb_entry(n, struct binder_node, rb_node); nodes++; + /* + * take a temporary ref on the node before + * calling binder_node_release() which will either + * kfree() the node or call binder_put_node() + */ + binder_inc_node_tmpref(node); rb_erase(&node->rb_node, &proc->nodes); incoming_refs = binder_node_release(node, incoming_refs); } @@ -3896,11 +3980,11 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node) hlist_for_each_entry(ref, &node->refs, node_entry) count++; - seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d", + seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->debug_id, (u64)node->ptr, (u64)node->cookie, node->has_strong_ref, node->has_weak_ref, node->local_strong_refs, node->local_weak_refs, - node->internal_strong_refs, count); + node->internal_strong_refs, count, node->tmp_refs); if (count) { seq_puts(m, " proc"); hlist_for_each_entry(ref, &node->refs, node_entry) From b0f59d6d045c31a10cc0175fdb34ba5ae067b5b2 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 29 May 2017 16:44:24 -0700 Subject: [PATCH 0923/3220] FROMLIST: binder: introduce locking helper functions (from https://patchwork.kernel.org/patch/9817791/) There are 3 main spinlocks which must be acquired in this order: 1) proc->outer_lock : protects most fields of binder_proc, binder_thread, and binder_ref structures. binder_proc_lock() and binder_proc_unlock() are used to acq/rel. 2) node->lock : protects most fields of binder_node. binder_node_lock() and binder_node_unlock() are used to acq/rel 3) proc->inner_lock : protects the thread and node lists (proc->threads, proc->nodes) and all todo lists associated with the binder_proc (proc->todo, thread->todo, proc->delivered_death and node->async_todo). binder_inner_proc_lock() and binder_inner_proc_unlock() are used to acq/rel Any lock under procA must never be nested under any lock at the same level or below on procB. 
Functions that require a lock held on entry indicate which lock in the suffix of the function name: foo_olocked() : requires node->outer_lock foo_nlocked() : requires node->lock foo_ilocked() : requires proc->inner_lock foo_iolocked(): requires proc->outer_lock and proc->inner_lock foo_nilocked(): requires node->lock and proc->inner_lock Change-Id: Ied42674486092a0e3bdde64356e45b2494844558 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 238 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4c636da38b2b..86eec8db3b4f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -15,6 +15,39 @@ * */ +/* + * Locking overview + * + * There are 3 main spinlocks which must be acquired in the + * order shown: + * + * 1) proc->outer_lock : protects binder_ref + * binder_proc_lock() and binder_proc_unlock() are + * used to acq/rel. + * 2) node->lock : protects most fields of binder_node. + * binder_node_lock() and binder_node_unlock() are + * used to acq/rel + * 3) proc->inner_lock : protects the thread and node lists + * (proc->threads, proc->nodes) and all todo lists associated + * with the binder_proc (proc->todo, thread->todo, + * proc->delivered_death and node->async_todo). + * binder_inner_proc_lock() and binder_inner_proc_unlock() + * are used to acq/rel + * + * Any lock under procA must never be nested under any lock at the same + * level or below on procB. + * + * Functions that require a lock held on entry indicate which lock + * in the suffix of the function name: + * + * foo_olocked() : requires node->outer_lock + * foo_nlocked() : requires node->lock + * foo_ilocked() : requires proc->inner_lock + * foo_oilocked(): requires proc->outer_lock and proc->inner_lock + * foo_nilocked(): requires node->lock and proc->inner_lock + * ... 
+ */ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -35,6 +68,7 @@ #include #include #include +#include #ifdef CONFIG_ANDROID_BINDER_IPC_32BIT #define BINDER_IPC_32BIT 1 @@ -106,6 +140,7 @@ enum { BINDER_DEBUG_FREE_BUFFER = 1U << 11, BINDER_DEBUG_INTERNAL_REFS = 1U << 12, BINDER_DEBUG_PRIORITY_CAP = 1U << 13, + BINDER_DEBUG_SPINLOCKS = 1U << 14, }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; @@ -262,8 +297,43 @@ struct binder_error { uint32_t cmd; }; +/** + * struct binder_node - binder node bookkeeping + * @debug_id: unique ID for debugging + * (invariant after initialized) + * @lock: lock for node fields + * @work: worklist element for node work + * @rb_node: element for proc->nodes tree + * @dead_node: element for binder_dead_nodes list + * (protected by binder_dead_nodes_lock) + * @proc: binder_proc that owns this node + * (invariant after initialized) + * @refs: list of references on this node + * @internal_strong_refs: used to take strong references when + * initiating a transaction + * @local_weak_refs: weak user refs from local process + * @local_strong_refs: strong user refs from local process + * @tmp_refs: temporary kernel refs + * @ptr: userspace pointer for node + * (invariant, no lock needed) + * @cookie: userspace cookie for node + * (invariant, no lock needed) + * @has_strong_ref: userspace notified of strong ref + * @pending_strong_ref: userspace has acked notification of strong ref + * @has_weak_ref: userspace notified of weak ref + * @pending_weak_ref: userspace has acked notification of weak ref + * @has_async_transaction: async transaction to node in progress + * @accept_fds: file descriptor operations supported for node + * (invariant after initialized) + * @min_priority: minimum scheduling priority + * (invariant after initialized) + * @async_todo: list of async work items + * + * Bookkeeping structure for binder nodes. + */ struct binder_node { int debug_id; + spinlock_t lock; struct binder_work work; union { struct rb_node rb_node; @@ -346,6 +416,51 @@ enum binder_deferred_state { BINDER_DEFERRED_RELEASE = 0x04, }; +/** + * struct binder_proc - binder process bookkeeping + * @proc_node: element for binder_procs list + * @threads: rbtree of binder_threads in this proc + * @nodes: rbtree of binder nodes associated with + * this proc ordered by node->ptr + * @refs_by_desc: rbtree of refs ordered by ref->desc + * @refs_by_node: rbtree of refs ordered by ref->node + * @pid PID of group_leader of process + * (invariant after initialized) + * @tsk task_struct for group_leader of process + * (invariant after initialized) + * @files files_struct for process + * (invariant after initialized) + * @deferred_work_node: element for binder_deferred_list + * (protected by binder_deferred_lock) + * @deferred_work: bitmap of deferred work to perform + * (protected by binder_deferred_lock) + * @is_dead: process is dead and awaiting free + * when outstanding transactions are cleaned up + * @todo: list of work for this process + * @wait: wait queue head to wait for proc work + * (invariant after initialized) + * @stats: per-process binder statistics + * (atomics, no lock needed) + * @delivered_death: list of delivered death notification + * @max_threads: cap on number of binder threads + * @requested_threads: number of binder threads requested but not + * yet started. In current implementation, can + * only be 0 or 1. 
+ * @requested_threads_started: number of binder threads started + * @ready_threads: number of threads waiting for proc work + * @tmp_ref: temporary reference to indicate proc is in use + * @default_priority: default scheduler priority + * (invariant after initialized) + * @debugfs_entry: debugfs node + * @alloc: binder allocator bookkeeping + * @context: binder_context for this proc + * (invariant after initialized) + * @inner_lock: can nest under outer_lock and/or node lock + * @outer_lock: no nesting under inner or node lock + * Lock order: 1) outer, 2) node, 3) inner + * + * Bookkeeping structure for binder processes + */ struct binder_proc { struct hlist_node proc_node; struct rb_root threads; @@ -372,6 +487,8 @@ struct binder_proc { struct dentry *debugfs_entry; struct binder_alloc alloc; struct binder_context *context; + spinlock_t inner_lock; + spinlock_t outer_lock; }; enum { @@ -382,6 +499,33 @@ enum { BINDER_LOOPER_STATE_WAITING = 0x10, }; +/** + * struct binder_thread - binder thread bookkeeping + * @proc: binder process for this thread + * (invariant after initialization) + * @rb_node: element for proc->threads rbtree + * @pid: PID for this thread + * (invariant after initialization) + * @looper: bitmap of looping state + * (only accessed by this thread) + * @looper_need_return: looping thread needs to exit driver + * (no lock needed) + * @transaction_stack: stack of in-progress transactions for this thread + * @todo: list of work to do for this thread + * @return_error: transaction errors reported by this thread + * (only accessed by this thread) + * @reply_error: transaction errors reported by target thread + * @wait: wait queue for thread work + * @stats: per-thread statistics + * (atomics, no lock needed) + * @tmp_ref: temporary reference to indicate thread is in use + * (atomic since @proc->inner_lock cannot + * always be acquired) + * @is_dead: thread is dead and awaiting free + * when outstanding transactions are cleaned up + * + * Bookkeeping structure for binder threads. + */ struct binder_thread { struct binder_proc *proc; struct rb_node rb_node; @@ -424,6 +568,97 @@ struct binder_transaction { spinlock_t lock; }; +/** + * binder_proc_lock() - Acquire outer lock for given binder_proc + * @proc: struct binder_proc to acquire + * + * Acquires proc->outer_lock. Used to protect binder_ref + * structures associated with the given proc. + */ +#define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__) +static void +_binder_proc_lock(struct binder_proc *proc, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_lock(&proc->outer_lock); +} + +/** + * binder_proc_unlock() - Release spinlock for given binder_proc + * @proc: struct binder_proc to acquire + * + * Release lock acquired via binder_proc_lock() + */ +#define binder_proc_unlock(_proc) _binder_proc_unlock(_proc, __LINE__) +static void +_binder_proc_unlock(struct binder_proc *proc, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_unlock(&proc->outer_lock); +} + +/** + * binder_inner_proc_lock() - Acquire inner lock for given binder_proc + * @proc: struct binder_proc to acquire + * + * Acquires proc->inner_lock.
Used to protect todo lists + */ +#define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__) +static void +_binder_inner_proc_lock(struct binder_proc *proc, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_lock(&proc->inner_lock); +} + +/** + * binder_inner_proc_unlock() - Release inner lock for given binder_proc + * @proc: struct binder_proc to acquire + * + * Release lock acquired via binder_inner_proc_lock() + */ +#define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__) +static void +_binder_inner_proc_unlock(struct binder_proc *proc, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_unlock(&proc->inner_lock); +} + +/** + * binder_node_lock() - Acquire spinlock for given binder_node + * @node: struct binder_node to acquire + * + * Acquires node->lock. Used to protect binder_node fields + */ +#define binder_node_lock(node) _binder_node_lock(node, __LINE__) +static void +_binder_node_lock(struct binder_node *node, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_lock(&node->lock); +} + +/** + * binder_node_unlock() - Release spinlock for given binder_node + * @node: struct binder_node to acquire + * + * Release lock acquired via binder_node_lock() + */ +#define binder_node_unlock(node) _binder_node_unlock(node, __LINE__) +static void +_binder_node_unlock(struct binder_node *node, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_unlock(&node->lock); +} + static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); @@ -568,6 +803,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; + spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); binder_debug(BINDER_DEBUG_INTERNAL_REFS, @@ -3600,6 +3836,8 @@ static int binder_open(struct inode *nodp, struct file *filp) proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; + spin_lock_init(&proc->inner_lock); + spin_lock_init(&proc->outer_lock); get_task_struct(current->group_leader); proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); From f73f378b52e81acdb9654a5bf81661d9ae461e62 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 21 Mar 2017 13:06:01 -0700 Subject: [PATCH 0924/3220] FROMLIST: binder: use inner lock to sync work dq and node counts (from https://patchwork.kernel.org/patch/9817789/) For correct behavior we need to hold the inner lock when dequeuing and processing node work in binder_thread_read. We now hold the inner lock when we enter the switch statement and release it after processing anything that might be affected by other threads. We also need to hold the inner lock to protect the node weak/strong ref tracking fields as long as node->proc is non-NULL (if it is NULL then we are guaranteed that we don't have any node work queued). This means that other functions that manipulate these fields must hold the inner lock. Refactored these functions to use the inner lock.
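To make the pattern concrete, this is the shape of the refactored
binder_inc_node() wrapper (condensed from the diff below and reformatted
here for readability): the inner lock is only taken while node->proc is
non-NULL, and the _ilocked variant does the actual ref accounting:

	static int binder_inc_node(struct binder_node *node, int strong,
				   int internal, struct list_head *target_list)
	{
		int ret;

		if (node->proc)
			binder_inner_proc_lock(node->proc);
		ret = binder_inc_node_ilocked(node, strong, internal,
					      target_list);
		if (node->proc)
			binder_inner_proc_unlock(node->proc);

		return ret;
	}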
Change-Id: I02c5cfdd3ab6dadea7f07f2a275faf3e27be77ad Test: tested manually Signed-off-by: Todd Kjos --- drivers/android/binder.c | 253 +++++++++++++++++++++++++++++++-------- 1 file changed, 200 insertions(+), 53 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 86eec8db3b4f..0bc8fe034234 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -311,17 +311,36 @@ struct binder_error { * @refs: list of references on this node * @internal_strong_refs: used to take strong references when * initiating a transaction + * (protected by @proc->inner_lock if @proc + * and by @lock) * @local_weak_refs: weak user refs from local process + * (protected by @proc->inner_lock if @proc + * and by @lock) * @local_strong_refs: strong user refs from local process + * (protected by @proc->inner_lock if @proc + * and by @lock) * @tmp_refs: temporary kernel refs + * (protected by @proc->inner_lock while @proc + * is valid, and by binder_dead_nodes_lock + * if @proc is NULL. During inc/dec and node release + * it is also protected by @lock to provide safety + * as the node dies and @proc becomes NULL) * @ptr: userspace pointer for node * (invariant, no lock needed) * @cookie: userspace cookie for node * (invariant, no lock needed) * @has_strong_ref: userspace notified of strong ref + * (protected by @proc->inner_lock if @proc + * and by @lock) * @pending_strong_ref: userspace has acked notification of strong ref + * (protected by @proc->inner_lock if @proc + * and by @lock) * @has_weak_ref: userspace notified of weak ref + * (protected by @proc->inner_lock if @proc + * and by @lock) * @pending_weak_ref: userspace has acked notification of weak ref + * (protected by @proc->inner_lock if @proc + * and by @lock) * @has_async_transaction: async transaction to node in progress * @accept_fds: file descriptor operations supported for node * (invariant after initialized) @@ -347,13 +366,24 @@ struct binder_node { int tmp_refs; binder_uintptr_t ptr; binder_uintptr_t cookie; - unsigned has_strong_ref:1; - unsigned pending_strong_ref:1; - unsigned has_weak_ref:1; - unsigned pending_weak_ref:1; - unsigned has_async_transaction:1; - unsigned accept_fds:1; - unsigned min_priority:8; + struct { + /* + * bitfield elements protected by + * proc inner_lock + */ + u8 has_strong_ref:1; + u8 pending_strong_ref:1; + u8 has_weak_ref:1; + u8 pending_weak_ref:1; + }; + struct { + /* + * invariant after initialization + */ + u8 accept_fds:1; + u8 min_priority; + }; + bool has_async_transaction; struct list_head async_todo; }; @@ -813,9 +843,18 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, return node; } -static int binder_inc_node(struct binder_node *node, int strong, int internal, - struct list_head *target_list) +static void binder_free_node(struct binder_node *node) { + kfree(node); + binder_stats_deleted(BINDER_STAT_NODE); +} + +static int binder_inc_node_ilocked(struct binder_node *node, int strong, + int internal, + struct list_head *target_list) +{ + if (node->proc) + BUG_ON(!spin_is_locked(&node->proc->inner_lock)); if (strong) { if (internal) { if (target_list == NULL && @@ -850,23 +889,43 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, return 0; } -static int binder_dec_node(struct binder_node *node, int strong, int internal) +static int binder_inc_node(struct binder_node *node, int strong, int internal, + struct list_head *target_list) { + int ret; + + if (node->proc) + binder_inner_proc_lock(node->proc); + ret = 
binder_inc_node_ilocked(node, strong, internal, target_list); + if (node->proc) + binder_inner_proc_unlock(node->proc); + + return ret; +} + +static bool binder_dec_node_ilocked(struct binder_node *node, + int strong, int internal) +{ + struct binder_proc *proc = node->proc; + + if (proc) + BUG_ON(!spin_is_locked(&proc->inner_lock)); if (strong) { if (internal) node->internal_strong_refs--; else node->local_strong_refs--; if (node->local_strong_refs || node->internal_strong_refs) - return 0; + return false; } else { if (!internal) node->local_weak_refs--; if (node->local_weak_refs || node->tmp_refs || !hlist_empty(&node->refs)) - return 0; + return false; } - if (node->proc && (node->has_strong_ref || node->has_weak_ref)) { + + if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { list_add_tail(&node->work.entry, &node->proc->todo); wake_up_interruptible(&node->proc->wait); @@ -875,35 +934,48 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal) if (hlist_empty(&node->refs) && !node->local_strong_refs && !node->local_weak_refs && !node->tmp_refs) { list_del_init(&node->work.entry); - if (node->proc) { + if (proc) { rb_erase(&node->rb_node, &node->proc->nodes); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "refless node %d deleted\n", node->debug_id); } else { spin_lock(&binder_dead_nodes_lock); + /* + * tmp_refs could have changed so + * check it again + */ + if (node->tmp_refs) { + spin_unlock(&binder_dead_nodes_lock); + return false; + } hlist_del(&node->dead_node); spin_unlock(&binder_dead_nodes_lock); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "dead node %d deleted\n", node->debug_id); } - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); + return true; } } - - return 0; + return false; } -/** - * binder_inc_node_tmpref() - take a temporary reference on node - * @node: node to reference - * - * Take reference on node to prevent the node from being freed - * while referenced only by a local variable - */ -static void binder_inc_node_tmpref(struct binder_node *node) +static void binder_dec_node(struct binder_node *node, int strong, int internal) +{ + bool free_node; + + if (node->proc) + binder_inner_proc_lock(node->proc); + free_node = binder_dec_node_ilocked(node, strong, internal); + if (node->proc) + binder_inner_proc_unlock(node->proc); + + if (free_node) + binder_free_node(node); +} + +static void binder_inc_node_tmpref_ilocked(struct binder_node *node) { /* * No call to binder_inc_node() is needed since we @@ -913,6 +985,32 @@ static void binder_inc_node_tmpref(struct binder_node *node) node->tmp_refs++; } +/** + * binder_inc_node_tmpref() - take a temporary reference on node + * @node: node to reference + * + * Take reference on node to prevent the node from being freed + * while referenced only by a local variable. The inner lock is + * needed to serialize with the node work on the queue (which + * isn't needed after the node is dead). 
If the node is dead + * (node->proc is NULL), use binder_dead_nodes_lock to protect + * node->tmp_refs against dead-node-only cases where the node + * lock cannot be acquired (eg traversing the dead node list to + * print nodes) + */ +static void binder_inc_node_tmpref(struct binder_node *node) +{ + if (node->proc) + binder_inner_proc_lock(node->proc); + else + spin_lock(&binder_dead_nodes_lock); + binder_inc_node_tmpref_ilocked(node); + if (node->proc) + binder_inner_proc_unlock(node->proc); + else + spin_unlock(&binder_dead_nodes_lock); +} + /** * binder_dec_node_tmpref() - remove a temporary reference on node * @node: node to reference @@ -921,15 +1019,27 @@ static void binder_inc_node_tmpref(struct binder_node *node) */ static void binder_dec_node_tmpref(struct binder_node *node) { + bool free_node; + + if (node->proc) + binder_inner_proc_lock(node->proc); + else + spin_lock(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); + if (!node->proc) + spin_unlock(&binder_dead_nodes_lock); /* * Call binder_dec_node() to check if all refcounts are 0 * and cleanup is needed. Calling with strong=0 and internal=1 * causes no actual reference to be released in binder_dec_node(). * If that changes, a change is needed here too. */ - binder_dec_node(node, 0, 1); + free_node = binder_dec_node_ilocked(node, 0, 1); + if (node->proc) + binder_inner_proc_unlock(node->proc); + if (free_node) + binder_free_node(node); } static void binder_put_node(struct binder_node *node) @@ -1042,6 +1152,9 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, static void binder_cleanup_ref(struct binder_ref *ref) { + bool delete_node = false; + struct binder_proc *node_proc = ref->node->proc; + binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, @@ -1050,11 +1163,26 @@ static void binder_cleanup_ref(struct binder_ref *ref) rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); + if (node_proc) + binder_inner_proc_lock(node_proc); if (ref->data.strong) - binder_dec_node(ref->node, 1, 1); + binder_dec_node_ilocked(ref->node, 1, 1); hlist_del(&ref->node_entry); - binder_dec_node(ref->node, 0, 1); + delete_node = binder_dec_node_ilocked(ref->node, 0, 1); + if (node_proc) + binder_inner_proc_unlock(node_proc); + /* + * Clear ref->node unless we want the caller to free the node + */ + if (!delete_node) { + /* + * The caller uses ref->node to determine + * whether the node needs to be freed. Clear + * it since the node is still alive. + */ + ref->node = NULL; + } if (ref->death) { binder_debug(BINDER_DEBUG_DEAD_BINDER, @@ -1123,13 +1251,8 @@ static bool binder_dec_ref(struct binder_ref *ref, int strong) return false; } ref->data.strong--; - if (ref->data.strong == 0) { - int ret; - - ret = binder_dec_node(ref->node, strong, 1); - if (ret) - return false; - } + if (ref->data.strong == 0) + binder_dec_node(ref->node, strong, 1); } else { if (ref->data.weak == 0) { binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", @@ -1194,10 +1317,13 @@ err_no_ref: * binder_free_ref() - free the binder_ref * @ref: ref to free * - * Free the binder_ref and the binder_ref_death indicated by ref->death. + * Free the binder_ref. Free the binder_node indicated by ref->node + * (if non-NULL) and the binder_ref_death indicated by ref->death. 
*/ static void binder_free_ref(struct binder_ref *ref) { + if (ref->node) + binder_free_node(ref->node); kfree(ref->death); kfree(ref); } @@ -2688,11 +2814,13 @@ static int binder_thread_write(struct binder_proc *proc, binder_put_node(node); break; } + binder_inner_proc_lock(proc); if (cmd == BC_ACQUIRE_DONE) { if (node->pending_strong_ref == 0) { binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); + binder_inner_proc_unlock(proc); binder_put_node(node); break; } @@ -2702,11 +2830,13 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); + binder_inner_proc_unlock(proc); binder_put_node(node); break; } node->pending_weak_ref = 0; } + binder_inner_proc_unlock(proc); binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s node %d ls %d lw %d tr %d\n", @@ -3092,6 +3222,7 @@ retry: struct binder_transaction *t = NULL; struct binder_thread *t_from; + binder_inner_proc_lock(proc); if (!list_empty(&thread->todo)) { w = list_first_entry(&thread->todo, struct binder_work, entry); @@ -3105,11 +3236,15 @@ retry: break; } - if (end - ptr < sizeof(tr) + 4) + if (end - ptr < sizeof(tr) + 4) { + binder_inner_proc_unlock(proc); break; + } + list_del_init(&w->entry); switch (w->type) { case BINDER_WORK_TRANSACTION: { + binder_inner_proc_unlock(proc); t = container_of(w, struct binder_transaction, work); } break; case BINDER_WORK_RETURN_ERROR: { @@ -3117,15 +3252,16 @@ retry: w, struct binder_error, work); WARN_ON(e->cmd == BR_OK); + binder_inner_proc_unlock(proc); if (put_user(e->cmd, (uint32_t __user *)ptr)) return -EFAULT; e->cmd = BR_OK; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); - list_del(&w->entry); } break; case BINDER_WORK_TRANSACTION_COMPLETE: { + binder_inner_proc_unlock(proc); cmd = BR_TRANSACTION_COMPLETE; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; @@ -3135,8 +3271,6 @@ retry: binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, "%d:%d BR_TRANSACTION_COMPLETE\n", proc->pid, thread->pid); - - list_del(&w->entry); kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); } break; @@ -3173,8 +3307,6 @@ retry: node->has_strong_ref = 0; if (!weak && has_weak_ref) node->has_weak_ref = 0; - list_del(&w->entry); - if (!weak && !strong) { binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx deleted\n", @@ -3183,9 +3315,11 @@ retry: (u64)node_ptr, (u64)node_cookie); rb_erase(&node->rb_node, &proc->nodes); - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); - } + binder_inner_proc_unlock(proc); + binder_free_node(node); + } else + binder_inner_proc_unlock(proc); + if (weak && !has_weak_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, @@ -3227,6 +3361,13 @@ retry: cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; else cmd = BR_DEAD_BINDER; + /* + * TODO: there is a race condition between + * death notification requests and delivery + * of the notifications. This will be handled + * in a later patch. 
+ */ + binder_inner_proc_unlock(proc); if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); @@ -3244,11 +3385,14 @@ retry: (u64)death->cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { - list_del(&w->entry); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); - } else - list_move(&w->entry, &proc->delivered_death); + } else { + binder_inner_proc_lock(proc); + list_add_tail(&w->entry, + &proc->delivered_death); + binder_inner_proc_unlock(proc); + } if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; @@ -3326,7 +3470,6 @@ retry: if (t_from) binder_thread_dec_tmpref(t_from); - list_del(&t->work.entry); t->buffer->allow_user_free = 1; if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { t->to_parent = thread->transaction_stack; @@ -3925,16 +4068,19 @@ static int binder_node_release(struct binder_node *node, int refs) { struct binder_ref *ref; int death = 0; + struct binder_proc *proc = node->proc; - list_del_init(&node->work.entry); binder_release_work(&node->async_todo); + + binder_inner_proc_lock(proc); + list_del_init(&node->work.entry); /* * The caller must have taken a temporary ref on the node, */ BUG_ON(!node->tmp_refs); if (hlist_empty(&node->refs) && node->tmp_refs == 1) { - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); + binder_inner_proc_unlock(proc); + binder_free_node(node); return refs; } @@ -3942,6 +4088,7 @@ static int binder_node_release(struct binder_node *node, int refs) node->proc = NULL; node->local_strong_refs = 0; node->local_weak_refs = 0; + binder_inner_proc_unlock(proc); spin_lock(&binder_dead_nodes_lock); hlist_add_head(&node->dead_node, &binder_dead_nodes); From 57628830c30714d7231f7d5757b968184c8a7e3c Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 20 Oct 2016 10:33:00 -0700 Subject: [PATCH 0925/3220] FROMLIST: binder: add spinlocks to protect todo lists (from https://patchwork.kernel.org/patch/9817769/) The todo lists in the proc, thread, and node structures are accessed by other procs/threads to place work items on the queue. The todo lists are protected by the new proc->inner_lock. No locks should ever be nested under these locks. As the name suggests, an outer lock will be introduced in a later patch. Change-Id: I7720bacf5ebae4af177e22fcab0900d54c94c11a Signed-off-by: Todd Kjos --- drivers/android/binder.c | 355 +++++++++++++++++++++++++++++---------- 1 file changed, 269 insertions(+), 86 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 0bc8fe034234..36074ce8d1b1 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -279,8 +279,16 @@ struct binder_device { struct binder_context context; }; +/** + * struct binder_work - work enqueued on a worklist + * @entry: node enqueued on list + * @type: type of work to be performed + * + * There are separate work lists for proc, thread, and node (async). 
+ */ struct binder_work { struct list_head entry; + enum { BINDER_WORK_TRANSACTION = 1, BINDER_WORK_TRANSACTION_COMPLETE, @@ -303,6 +311,7 @@ struct binder_error { * (invariant after initialized) * @lock: lock for node fields * @work: worklist element for node work + * (protected by @proc->inner_lock) * @rb_node: element for proc->nodes tree * @dead_node: element for binder_dead_nodes list * (protected by binder_dead_nodes_lock) @@ -347,6 +356,7 @@ struct binder_error { * @min_priority: minimum scheduling priority * (invariant after initialized) * @async_todo: list of async work items + * (protected by @proc->inner_lock) * * Bookkeeping structure for binder nodes. */ @@ -388,6 +398,11 @@ struct binder_node { }; struct binder_ref_death { + /** + * @work: worklist element for death notifications + * (protected by inner_lock of the proc that + * this ref belongs to) + */ struct binder_work work; binder_uintptr_t cookie; }; @@ -467,11 +482,13 @@ enum binder_deferred_state { * @is_dead: process is dead and awaiting free * when outstanding transactions are cleaned up * @todo: list of work for this process + * (protected by @inner_lock) * @wait: wait queue head to wait for proc work * (invariant after initialized) * @stats: per-process binder statistics * (atomics, no lock needed) * @delivered_death: list of delivered death notification + * (protected by @inner_lock) * @max_threads: cap on number of binder threads * @requested_threads: number of binder threads requested but not * yet started. In current implementation, can @@ -542,6 +559,7 @@ enum { * (no lock needed) * @transaction_stack: stack of in-progress transactions for this thread * @todo: list of work to do for this thread + * (protected by @proc->inner_lock) * @return_error: transaction errors reported by this thread * (only accessed by this thread) * @reply_error: transaction errors reported by target thread @@ -689,6 +707,111 @@ _binder_node_unlock(struct binder_node *node, int line) spin_unlock(&node->lock); } +static bool binder_worklist_empty_ilocked(struct list_head *list) +{ + return list_empty(list); +} + +/** + * binder_worklist_empty() - Check if no items on the work list + * @proc: binder_proc associated with list + * @list: list to check + * + * Return: true if there are no items on list, else false + */ +static bool binder_worklist_empty(struct binder_proc *proc, + struct list_head *list) +{ + bool ret; + + binder_inner_proc_lock(proc); + ret = binder_worklist_empty_ilocked(list); + binder_inner_proc_unlock(proc); + return ret; +} + +static void +binder_enqueue_work_ilocked(struct binder_work *work, + struct list_head *target_list) +{ + BUG_ON(target_list == NULL); + BUG_ON(work->entry.next && !list_empty(&work->entry)); + list_add_tail(&work->entry, target_list); +} + +/** + * binder_enqueue_work() - Add an item to the work list + * @proc: binder_proc associated with list + * @work: struct binder_work to add to list + * @target_list: list to add work to + * + * Adds the work to the specified list. Asserts that work + * is not already on a list. 
+ */ +static void +binder_enqueue_work(struct binder_proc *proc, + struct binder_work *work, + struct list_head *target_list) +{ + binder_inner_proc_lock(proc); + binder_enqueue_work_ilocked(work, target_list); + binder_inner_proc_unlock(proc); +} + +static void +binder_dequeue_work_ilocked(struct binder_work *work) +{ + list_del_init(&work->entry); +} + +/** + * binder_dequeue_work() - Removes an item from the work list + * @proc: binder_proc associated with list + * @work: struct binder_work to remove from list + * + * Removes the specified work item from whatever list it is on. + * Can safely be called if work is not on any list. + */ +static void +binder_dequeue_work(struct binder_proc *proc, struct binder_work *work) +{ + binder_inner_proc_lock(proc); + binder_dequeue_work_ilocked(work); + binder_inner_proc_unlock(proc); +} + +static struct binder_work *binder_dequeue_work_head_ilocked( + struct list_head *list) +{ + struct binder_work *w; + + w = list_first_entry_or_null(list, struct binder_work, entry); + if (w) + list_del_init(&w->entry); + return w; +} + +/** + * binder_dequeue_work_head() - Dequeues the item at head of list + * @proc: binder_proc associated with list + * @list: list to dequeue head + * + * Removes the head of the list if there are items on the list + * + * Return: pointer dequeued binder_work, NULL if list was empty + */ +static struct binder_work *binder_dequeue_work_head( + struct binder_proc *proc, + struct list_head *list) +{ + struct binder_work *w; + + binder_inner_proc_lock(proc); + w = binder_dequeue_work_head_ilocked(list); + binder_inner_proc_unlock(proc); + return w; +} + static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); @@ -871,8 +994,8 @@ static int binder_inc_node_ilocked(struct binder_node *node, int strong, } else node->local_strong_refs++; if (!node->has_strong_ref && target_list) { - list_del_init(&node->work.entry); - list_add_tail(&node->work.entry, target_list); + binder_dequeue_work_ilocked(&node->work); + binder_enqueue_work_ilocked(&node->work, target_list); } } else { if (!internal) @@ -883,7 +1006,7 @@ static int binder_inc_node_ilocked(struct binder_node *node, int strong, node->debug_id); return -EINVAL; } - list_add_tail(&node->work.entry, target_list); + binder_enqueue_work_ilocked(&node->work, target_list); } } return 0; @@ -927,19 +1050,20 @@ static bool binder_dec_node_ilocked(struct binder_node *node, if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { - list_add_tail(&node->work.entry, &node->proc->todo); + binder_enqueue_work_ilocked(&node->work, &proc->todo); wake_up_interruptible(&node->proc->wait); } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && !node->local_weak_refs && !node->tmp_refs) { - list_del_init(&node->work.entry); if (proc) { - rb_erase(&node->rb_node, &node->proc->nodes); + binder_dequeue_work_ilocked(&node->work); + rb_erase(&node->rb_node, &proc->nodes); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "refless node %d deleted\n", node->debug_id); } else { + BUG_ON(!list_empty(&node->work.entry)); spin_lock(&binder_dead_nodes_lock); /* * tmp_refs could have changed so @@ -1189,7 +1313,7 @@ static void binder_cleanup_ref(struct binder_ref *ref) "%d delete ref %d desc %d has death notification\n", ref->proc->pid, ref->data.debug_id, ref->data.desc); - list_del(&ref->death->work.entry); + binder_dequeue_work(ref->proc, &ref->death->work); 
binder_stats_deleted(BINDER_STAT_DEATH); } binder_stats_deleted(BINDER_STAT_REF); @@ -1540,8 +1664,9 @@ static void binder_send_failed_reply(struct binder_transaction *t, binder_pop_transaction(target_thread, t); if (target_thread->reply_error.cmd == BR_OK) { target_thread->reply_error.cmd = error_code; - list_add_tail( - &target_thread->reply_error.work.entry, + binder_enqueue_work( + target_thread->proc, + &target_thread->reply_error.work, &target_thread->todo); wake_up_interruptible(&target_thread->wait); } else { @@ -2579,7 +2704,7 @@ static void binder_transaction(struct binder_proc *proc, } } tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; - list_add_tail(&tcomplete->entry, &thread->todo); + binder_enqueue_work(proc, tcomplete, &thread->todo); if (reply) { if (target_thread->is_dead) @@ -2610,7 +2735,7 @@ static void binder_transaction(struct binder_proc *proc, goto err_dead_proc_or_thread; } t->work.type = BINDER_WORK_TRANSACTION; - list_add_tail(&t->work.entry, target_list); + binder_enqueue_work(target_proc, &t->work, target_list); if (target_wait) { if (reply || !(tr->flags & TF_ONE_WAY)) wake_up_interruptible_sync(target_wait); @@ -2686,13 +2811,15 @@ err_no_context_mgr_node: BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { thread->return_error.cmd = BR_TRANSACTION_COMPLETE; - list_add_tail(&thread->return_error.work.entry, - &thread->todo); + binder_enqueue_work(thread->proc, + &thread->return_error.work, + &thread->todo); binder_send_failed_reply(in_reply_to, return_error); } else { thread->return_error.cmd = return_error; - list_add_tail(&thread->return_error.work.entry, - &thread->todo); + binder_enqueue_work(thread->proc, + &thread->return_error.work, + &thread->todo); } } @@ -2885,11 +3012,21 @@ static int binder_thread_write(struct binder_proc *proc, buffer->transaction = NULL; } if (buffer->async_transaction && buffer->target_node) { - BUG_ON(!buffer->target_node->has_async_transaction); - if (list_empty(&buffer->target_node->async_todo)) - buffer->target_node->has_async_transaction = 0; + struct binder_node *buf_node; + struct binder_work *w; + + buf_node = buffer->target_node; + BUG_ON(!buf_node->has_async_transaction); + BUG_ON(buf_node->proc != proc); + binder_inner_proc_lock(proc); + w = binder_dequeue_work_head_ilocked( + &buf_node->async_todo); + if (!w) + buf_node->has_async_transaction = 0; else - list_move_tail(buffer->target_node->async_todo.next, &thread->todo); + binder_enqueue_work_ilocked( + w, &thread->todo); + binder_inner_proc_unlock(proc); } trace_binder_transaction_buffer_release(buffer); binder_transaction_buffer_release(proc, buffer, NULL); @@ -3001,9 +3138,10 @@ static int binder_thread_write(struct binder_proc *proc, WARN_ON(thread->return_error.cmd != BR_OK); thread->return_error.cmd = BR_ERROR; - list_add_tail( - &thread->return_error.work.entry, - &thread->todo); + binder_enqueue_work( + thread->proc, + &thread->return_error.work, + &thread->todo); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", proc->pid, thread->pid); @@ -3015,11 +3153,20 @@ static int binder_thread_write(struct binder_proc *proc, ref->death = death; if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&ref->death->work.entry, &thread->todo); - } else { - list_add_tail(&ref->death->work.entry, &proc->todo); - wake_up_interruptible(&proc->wait); + if (thread->looper & + 
(BINDER_LOOPER_STATE_REGISTERED | + BINDER_LOOPER_STATE_ENTERED)) + binder_enqueue_work( + proc, + &ref->death->work, + &thread->todo); + else { + binder_enqueue_work( + proc, + &ref->death->work, + &proc->todo); + wake_up_interruptible( + &proc->wait); } } } else { @@ -3037,18 +3184,27 @@ static int binder_thread_write(struct binder_proc *proc, break; } ref->death = NULL; + binder_inner_proc_lock(proc); if (list_empty(&death->work.entry)) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&death->work.entry, &thread->todo); - } else { - list_add_tail(&death->work.entry, &proc->todo); - wake_up_interruptible(&proc->wait); + if (thread->looper & + (BINDER_LOOPER_STATE_REGISTERED | + BINDER_LOOPER_STATE_ENTERED)) + binder_enqueue_work_ilocked( + &death->work, + &thread->todo); + else { + binder_enqueue_work_ilocked( + &death->work, + &proc->todo); + wake_up_interruptible( + &proc->wait); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } + binder_inner_proc_unlock(proc); } } break; case BC_DEAD_BINDER_DONE: { @@ -3060,8 +3216,13 @@ static int binder_thread_write(struct binder_proc *proc, return -EFAULT; ptr += sizeof(cookie); - list_for_each_entry(w, &proc->delivered_death, entry) { - struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work); + binder_inner_proc_lock(proc); + list_for_each_entry(w, &proc->delivered_death, + entry) { + struct binder_ref_death *tmp_death = + container_of(w, + struct binder_ref_death, + work); if (tmp_death->cookie == cookie) { death = tmp_death; @@ -3075,19 +3236,25 @@ static int binder_thread_write(struct binder_proc *proc, if (death == NULL) { binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n", proc->pid, thread->pid, (u64)cookie); + binder_inner_proc_unlock(proc); break; } - - list_del_init(&death->work.entry); + binder_dequeue_work_ilocked(&death->work); if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&death->work.entry, &thread->todo); - } else { - list_add_tail(&death->work.entry, &proc->todo); + if (thread->looper & + (BINDER_LOOPER_STATE_REGISTERED | + BINDER_LOOPER_STATE_ENTERED)) + binder_enqueue_work_ilocked( + &death->work, &thread->todo); + else { + binder_enqueue_work_ilocked( + &death->work, + &proc->todo); wake_up_interruptible(&proc->wait); } } + binder_inner_proc_unlock(proc); } break; default: @@ -3114,12 +3281,14 @@ static void binder_stat_br(struct binder_proc *proc, static int binder_has_proc_work(struct binder_proc *proc, struct binder_thread *thread) { - return !list_empty(&proc->todo) || thread->looper_need_return; + return !binder_worklist_empty(proc, &proc->todo) || + thread->looper_need_return; } static int binder_has_thread_work(struct binder_thread *thread) { - return !list_empty(&thread->todo) || thread->looper_need_return; + return !binder_worklist_empty(thread->proc, &thread->todo) || + thread->looper_need_return; } static int binder_put_node_cmd(struct binder_proc *proc, @@ -3173,7 +3342,7 @@ static int binder_thread_read(struct binder_proc *proc, retry: wait_for_proc_work = thread->transaction_stack == NULL && - list_empty(&thread->todo); + binder_worklist_empty(proc, &thread->todo); thread->looper |= BINDER_LOOPER_STATE_WAITING; if 
(wait_for_proc_work) @@ -3183,7 +3352,7 @@ retry: trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, - !list_empty(&thread->todo)); + !binder_worklist_empty(proc, &thread->todo)); if (wait_for_proc_work) { if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED))) { @@ -3218,18 +3387,20 @@ retry: while (1) { uint32_t cmd; struct binder_transaction_data tr; - struct binder_work *w; + struct binder_work *w = NULL; + struct list_head *list = NULL; struct binder_transaction *t = NULL; struct binder_thread *t_from; binder_inner_proc_lock(proc); - if (!list_empty(&thread->todo)) { - w = list_first_entry(&thread->todo, struct binder_work, - entry); - } else if (!list_empty(&proc->todo) && wait_for_proc_work) { - w = list_first_entry(&proc->todo, struct binder_work, - entry); - } else { + if (!binder_worklist_empty_ilocked(&thread->todo)) + list = &thread->todo; + else if (!binder_worklist_empty_ilocked(&proc->todo) && + wait_for_proc_work) + list = &proc->todo; + else { + binder_inner_proc_unlock(proc); + /* no data added */ if (ptr - buffer == 4 && !thread->looper_need_return) goto retry; @@ -3240,7 +3411,7 @@ retry: binder_inner_proc_unlock(proc); break; } - list_del_init(&w->entry); + w = binder_dequeue_work_head_ilocked(list); switch (w->type) { case BINDER_WORK_TRANSACTION: { @@ -3389,8 +3560,8 @@ retry: binder_stats_deleted(BINDER_STAT_DEATH); } else { binder_inner_proc_lock(proc); - list_add_tail(&w->entry, - &proc->delivered_death); + binder_enqueue_work_ilocked( + w, &proc->delivered_death); binder_inner_proc_unlock(proc); } if (cmd == BR_DEAD_BINDER) @@ -3500,13 +3671,16 @@ done: return 0; } -static void binder_release_work(struct list_head *list) +static void binder_release_work(struct binder_proc *proc, + struct list_head *list) { struct binder_work *w; - while (!list_empty(list)) { - w = list_first_entry(list, struct binder_work, entry); - list_del_init(&w->entry); + while (1) { + w = binder_dequeue_work_head(proc, list); + if (!w) + return; + switch (w->type) { case BINDER_WORK_TRANSACTION: { struct binder_transaction *t; @@ -3670,7 +3844,7 @@ static int binder_thread_release(struct binder_proc *proc, if (send_reply) binder_send_failed_reply(send_reply, BR_DEAD_REPLY); - binder_release_work(&thread->todo); + binder_release_work(proc, &thread->todo); binder_thread_dec_tmpref(thread); return active_transactions; } @@ -3687,7 +3861,7 @@ static unsigned int binder_poll(struct file *filp, thread = binder_get_thread(proc); wait_for_proc_work = thread->transaction_stack == NULL && - list_empty(&thread->todo); + binder_worklist_empty(proc, &thread->todo); binder_unlock(__func__); @@ -3750,7 +3924,7 @@ static int binder_ioctl_write_read(struct file *filp, &bwr.read_consumed, filp->f_flags & O_NONBLOCK); trace_binder_read_done(ret); - if (!list_empty(&proc->todo)) + if (!binder_worklist_empty(proc, &proc->todo)) wake_up_interruptible(&proc->wait); if (ret < 0) { if (copy_to_user(ubuf, &bwr, sizeof(bwr))) @@ -4070,10 +4244,10 @@ static int binder_node_release(struct binder_node *node, int refs) int death = 0; struct binder_proc *proc = node->proc; - binder_release_work(&node->async_todo); + binder_release_work(proc, &node->async_todo); binder_inner_proc_lock(proc); - list_del_init(&node->work.entry); + binder_dequeue_work_ilocked(&node->work); /* * The caller must have taken a temporary ref on the node, */ @@ -4102,13 +4276,15 @@ static int binder_node_release(struct binder_node *node, int refs) death++; + binder_inner_proc_lock(ref->proc); 
if (list_empty(&ref->death->work.entry)) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; - list_add_tail(&ref->death->work.entry, - &ref->proc->todo); + binder_enqueue_work_ilocked(&ref->death->work, + &ref->proc->todo); wake_up_interruptible(&ref->proc->wait); } else BUG(); + binder_inner_proc_unlock(ref->proc); } binder_debug(BINDER_DEBUG_DEAD_BINDER, @@ -4184,8 +4360,8 @@ static void binder_deferred_release(struct binder_proc *proc) binder_free_ref(ref); } - binder_release_work(&proc->todo); - binder_release_work(&proc->delivered_death); + binder_release_work(proc, &proc->todo); + binder_release_work(proc, &proc->delivered_death); binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n", @@ -4276,9 +4452,9 @@ static void print_binder_transaction(struct seq_file *m, const char *prefix, t->buffer->data); } -static void print_binder_work(struct seq_file *m, const char *prefix, - const char *transaction_prefix, - struct binder_work *w) +static void print_binder_work_ilocked(struct seq_file *m, const char *prefix, + const char *transaction_prefix, + struct binder_work *w) { struct binder_node *node; struct binder_transaction *t; @@ -4319,15 +4495,16 @@ static void print_binder_work(struct seq_file *m, const char *prefix, } } -static void print_binder_thread(struct seq_file *m, - struct binder_thread *thread, - int print_always) +static void print_binder_thread_ilocked(struct seq_file *m, + struct binder_thread *thread, + int print_always) { struct binder_transaction *t; struct binder_work *w; size_t start_pos = m->count; size_t header_pos; + WARN_ON(!spin_is_locked(&thread->proc->inner_lock)); seq_printf(m, " thread %d: l %02x need_return %d tr %d\n", thread->pid, thread->looper, thread->looper_need_return, @@ -4349,7 +4526,8 @@ static void print_binder_thread(struct seq_file *m, } } list_for_each_entry(w, &thread->todo, entry) { - print_binder_work(m, " ", " pending transaction", w); + print_binder_work_ilocked(m, " ", + " pending transaction", w); } if (!print_always && m->count == header_pos) m->count = start_pos; @@ -4376,9 +4554,13 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node) seq_printf(m, " %d", ref->proc->pid); } seq_puts(m, "\n"); - list_for_each_entry(w, &node->async_todo, entry) - print_binder_work(m, " ", - " pending async transaction", w); + if (node->proc) { + binder_inner_proc_lock(node->proc); + list_for_each_entry(w, &node->async_todo, entry) + print_binder_work_ilocked(m, " ", + " pending async transaction", w); + binder_inner_proc_unlock(node->proc); + } } static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) @@ -4402,9 +4584,11 @@ static void print_binder_proc(struct seq_file *m, seq_printf(m, "context %s\n", proc->context->name); header_pos = m->count; + binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) - print_binder_thread(m, rb_entry(n, struct binder_thread, + print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread, rb_node), print_all); + binder_inner_proc_unlock(proc); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); @@ -4419,12 +4603,14 @@ static void print_binder_proc(struct seq_file *m, rb_node_desc)); } binder_alloc_print_allocated(m, &proc->alloc); + binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) - print_binder_work(m, " ", " pending transaction", w); + print_binder_work_ilocked(m, " ", " pending 
transaction", w); list_for_each_entry(w, &proc->delivered_death, entry) { seq_puts(m, " has delivered dead binder\n"); break; } + binder_inner_proc_unlock(proc); if (!print_all && m->count == header_pos) m->count = start_pos; } @@ -4563,15 +4749,12 @@ static void print_binder_proc_stats(struct seq_file *m, seq_printf(m, " buffers: %d\n", count); count = 0; + binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) { - switch (w->type) { - case BINDER_WORK_TRANSACTION: + if (w->type == BINDER_WORK_TRANSACTION) count++; - break; - default: - break; - } } + binder_inner_proc_unlock(proc); seq_printf(m, " pending transactions: %d\n", count); print_binder_stats(m, " ", &proc->stats); From 14c312e9262393410ed351b04a44057220e31f6a Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 8 Jun 2017 13:45:59 -0700 Subject: [PATCH 0926/3220] FROMLIST: binder: add spinlock to protect binder_node (from https://patchwork.kernel.org/patch/9817769/) node->lock is used to protect elements of node. No need to acquire for fields that are invariant: debug_id, ptr, cookie. Change-Id: Ib7738e52fa7689767f17136e18cc05ff548b5717 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 220 +++++++++++++++++++++++++++++---------- 1 file changed, 165 insertions(+), 55 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 36074ce8d1b1..2ca40f4d0bb5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -318,6 +318,7 @@ struct binder_error { * @proc: binder_proc that owns this node * (invariant after initialized) * @refs: list of references on this node + * (protected by @lock) * @internal_strong_refs: used to take strong references when * initiating a transaction * (protected by @proc->inner_lock if @proc @@ -351,6 +352,7 @@ struct binder_error { * (protected by @proc->inner_lock if @proc * and by @lock) * @has_async_transaction: async transaction to node in progress + * (protected by @lock) * @accept_fds: file descriptor operations supported for node * (invariant after initialized) * @min_priority: minimum scheduling priority @@ -432,6 +434,7 @@ struct binder_ref_data { * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree * @rb_node_node: node for lookup by @node in proc's rb_tree * @node_entry: list entry for node->refs list in target node + * (protected by @node->lock) * @proc: binder_proc containing ref * @node: binder_node of target node. When cleaning up a * ref for deletion in binder_cleanup_ref, a non-NULL @@ -707,6 +710,43 @@ _binder_node_unlock(struct binder_node *node, int line) spin_unlock(&node->lock); } +/** + * binder_node_inner_lock() - Acquire node and inner locks + * @node: struct binder_node to acquire + * + * Acquires node->lock. If node->proc is non-NULL, also acquires + proc->inner_lock.
Used to protect binder_node fields + */ +#define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__) +static void +_binder_node_inner_lock(struct binder_node *node, int line) +{ + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + spin_lock(&node->lock); + if (node->proc) + binder_inner_proc_lock(node->proc); +} + +/** + * binder_node_inner_unlock() - Release node and inner locks + * @node: struct binder_node to acquire + * + * Release lock acquired via binder_node_inner_lock() + */ +#define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__) +static void +_binder_node_inner_unlock(struct binder_node *node, int line) +{ + struct binder_proc *proc = node->proc; + + binder_debug(BINDER_DEBUG_SPINLOCKS, + "%s: line=%d\n", __func__, line); + if (proc) + binder_inner_proc_unlock(proc); + spin_unlock(&node->lock); +} + static bool binder_worklist_empty_ilocked(struct list_head *list) { return list_empty(list); @@ -925,12 +965,14 @@ } static struct binder_node *binder_new_node(struct binder_proc *proc, - binder_uintptr_t ptr, - binder_uintptr_t cookie) + struct flat_binder_object *fp) { struct rb_node **p = &proc->nodes.rb_node; struct rb_node *parent = NULL; struct binder_node *node; + binder_uintptr_t ptr = fp ? fp->binder : 0; + binder_uintptr_t cookie = fp ? fp->cookie : 0; + __u32 flags = fp ? fp->flags : 0; while (*p) { parent = *p; @@ -956,6 +998,8 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; + node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); @@ -972,12 +1016,15 @@ static void binder_free_node(struct binder_node *node) binder_stats_deleted(BINDER_STAT_NODE); } -static int binder_inc_node_ilocked(struct binder_node *node, int strong, - int internal, - struct list_head *target_list) +static int binder_inc_node_nilocked(struct binder_node *node, int strong, + int internal, + struct list_head *target_list) { - if (node->proc) - BUG_ON(!spin_is_locked(&node->proc->inner_lock)); + struct binder_proc *proc = node->proc; + + BUG_ON(!spin_is_locked(&node->lock)); + if (proc) + BUG_ON(!spin_is_locked(&proc->inner_lock)); if (strong) { if (internal) { if (target_list == NULL && @@ -1017,20 +1064,19 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal, { int ret; - if (node->proc) - binder_inner_proc_lock(node->proc); - ret = binder_inc_node_ilocked(node, strong, internal, target_list); - if (node->proc) - binder_inner_proc_unlock(node->proc); + binder_node_inner_lock(node); + ret = binder_inc_node_nilocked(node, strong, internal, target_list); + binder_node_inner_unlock(node); return ret; } -static bool binder_dec_node_ilocked(struct binder_node *node, - int strong, int internal) +static bool binder_dec_node_nilocked(struct binder_node *node, + int strong, int internal) { struct binder_proc *proc = node->proc; + BUG_ON(!spin_is_locked(&node->lock)); if (proc) BUG_ON(!spin_is_locked(&proc->inner_lock)); if (strong) { @@ -1089,12 +1135,9 @@ static void binder_dec_node(struct binder_node *node, int strong, int internal) { bool free_node; - if (node->proc) - binder_inner_proc_lock(node->proc); - free_node = binder_dec_node_ilocked(node, strong, internal); - if (node->proc) -
binder_inner_proc_unlock(node->proc); - + binder_node_inner_lock(node); + free_node = binder_dec_node_nilocked(node, strong, internal); + binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } @@ -1124,6 +1167,7 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node) */ static void binder_inc_node_tmpref(struct binder_node *node) { + binder_node_lock(node); if (node->proc) binder_inner_proc_lock(node->proc); else @@ -1133,6 +1177,7 @@ static void binder_inc_node_tmpref(struct binder_node *node) binder_inner_proc_unlock(node->proc); else spin_unlock(&binder_dead_nodes_lock); + binder_node_unlock(node); } /** @@ -1145,9 +1190,8 @@ static void binder_dec_node_tmpref(struct binder_node *node) { bool free_node; - if (node->proc) - binder_inner_proc_lock(node->proc); - else + binder_node_inner_lock(node); + if (!node->proc) spin_lock(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); @@ -1159,9 +1203,8 @@ static void binder_dec_node_tmpref(struct binder_node *node) * causes no actual reference to be released in binder_dec_node(). * If that changes, a change is needed here too. */ - free_node = binder_dec_node_ilocked(node, 0, 1); - if (node->proc) - binder_inner_proc_unlock(node->proc); + free_node = binder_dec_node_nilocked(node, 0, 1); + binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } @@ -1265,19 +1308,21 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, } rb_link_node(&new_ref->rb_node_desc, parent, p); rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); + + binder_node_lock(node); hlist_add_head(&new_ref->node_entry, &node->refs); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d new ref %d desc %d for node %d\n", proc->pid, new_ref->data.debug_id, new_ref->data.desc, node->debug_id); + binder_node_unlock(node); return new_ref; } static void binder_cleanup_ref(struct binder_ref *ref) { bool delete_node = false; - struct binder_proc *node_proc = ref->node->proc; binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", @@ -1287,15 +1332,13 @@ static void binder_cleanup_ref(struct binder_ref *ref) rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); - if (node_proc) - binder_inner_proc_lock(node_proc); + binder_node_inner_lock(ref->node); if (ref->data.strong) - binder_dec_node_ilocked(ref->node, 1, 1); + binder_dec_node_nilocked(ref->node, 1, 1); hlist_del(&ref->node_entry); - delete_node = binder_dec_node_ilocked(ref->node, 0, 1); - if (node_proc) - binder_inner_proc_unlock(node_proc); + delete_node = binder_dec_node_nilocked(ref->node, 0, 1); + binder_node_inner_unlock(ref->node); /* * Clear ref->node unless we want the caller to free the node */ @@ -1990,12 +2033,9 @@ static int binder_translate_binder(struct flat_binder_object *fp, node = binder_get_node(proc, fp->binder); if (!node) { - node = binder_new_node(proc, fp->binder, fp->cookie); + node = binder_new_node(proc, fp); if (!node) return -ENOMEM; - - node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; - node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); } if (fp->cookie != node->cookie) { binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", @@ -2056,6 +2096,7 @@ static int binder_translate_handle(struct flat_binder_object *fp, goto done; } + binder_node_lock(node); if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) fp->hdr.type = BINDER_TYPE_BINDER; @@ 
-2063,18 +2104,24 @@ static int binder_translate_handle(struct flat_binder_object *fp, fp->hdr.type = BINDER_TYPE_WEAK_BINDER; fp->binder = node->ptr; fp->cookie = node->cookie; - binder_inc_node(node, - fp->hdr.type == BINDER_TYPE_BINDER, - 0, NULL); + if (node->proc) + binder_inner_proc_lock(node->proc); + binder_inc_node_nilocked(node, + fp->hdr.type == BINDER_TYPE_BINDER, + 0, NULL); + if (node->proc) + binder_inner_proc_unlock(node->proc); trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", src_rdata.debug_id, src_rdata.desc, node->debug_id, (u64)node->ptr); + binder_node_unlock(node); } else { int ret; struct binder_ref_data dest_rdata; + binder_node_unlock(node); ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_HANDLE, NULL, &dest_rdata); @@ -2382,13 +2429,16 @@ static void binder_transaction(struct binder_proc *proc, mutex_unlock(&context->context_mgr_node_lock); } e->to_node = target_node->debug_id; + binder_node_lock(target_node); target_proc = target_node->proc; if (target_proc == NULL) { + binder_node_unlock(target_node); return_error = BR_DEAD_REPLY; return_error_line = __LINE__; goto err_dead_binder; } target_proc->tmp_ref++; + binder_node_unlock(target_node); if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -2705,6 +2755,7 @@ static void binder_transaction(struct binder_proc *proc, } tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; binder_enqueue_work(proc, tcomplete, &thread->todo); + t->work.type = BINDER_WORK_TRANSACTION; if (reply) { if (target_thread->is_dead) @@ -2712,6 +2763,7 @@ static void binder_transaction(struct binder_proc *proc, BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction(target_thread, in_reply_to); binder_free_transaction(in_reply_to); + binder_enqueue_work(target_proc, &t->work, target_list); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); t->need_reply = 1; @@ -2722,20 +2774,29 @@ static void binder_transaction(struct binder_proc *proc, binder_pop_transaction(thread, t); goto err_dead_proc_or_thread; } + binder_enqueue_work(target_proc, &t->work, target_list); } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); + binder_node_lock(target_node); if (target_node->has_async_transaction) { target_list = &target_node->async_todo; target_wait = NULL; } else target_node->has_async_transaction = 1; + /* + * Test/set of has_async_transaction + * must be atomic with enqueue on + * async_todo + */ if (target_proc->is_dead || - (target_thread && target_thread->is_dead)) + (target_thread && target_thread->is_dead)) { + binder_node_unlock(target_node); goto err_dead_proc_or_thread; + } + binder_enqueue_work(target_proc, &t->work, target_list); + binder_node_unlock(target_node); } - t->work.type = BINDER_WORK_TRANSACTION; - binder_enqueue_work(target_proc, &t->work, target_list); if (target_wait) { if (reply || !(tr->flags & TF_ONE_WAY)) wake_up_interruptible_sync(target_wait); @@ -2914,6 +2975,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_uintptr_t node_ptr; binder_uintptr_t cookie; struct binder_node *node; + bool free_node; if (get_user(node_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; @@ -2941,13 +3003,13 @@ static int binder_thread_write(struct binder_proc *proc, binder_put_node(node); break; } - binder_inner_proc_lock(proc); + binder_node_inner_lock(node); if (cmd == BC_ACQUIRE_DONE) 
{ if (node->pending_strong_ref == 0) { binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); - binder_inner_proc_unlock(proc); + binder_node_inner_unlock(node); binder_put_node(node); break; } @@ -2957,20 +3019,22 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); - binder_inner_proc_unlock(proc); + binder_node_inner_unlock(node); binder_put_node(node); break; } node->pending_weak_ref = 0; } - binder_inner_proc_unlock(proc); - binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0); + free_node = binder_dec_node_nilocked(node, + cmd == BC_ACQUIRE_DONE, 0); + WARN_ON(free_node); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s node %d ls %d lw %d tr %d\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", node->debug_id, node->local_strong_refs, node->local_weak_refs, node->tmp_refs); + binder_node_inner_unlock(node); binder_put_node(node); break; } @@ -3016,9 +3080,9 @@ static int binder_thread_write(struct binder_proc *proc, struct binder_work *w; buf_node = buffer->target_node; + binder_node_inner_lock(buf_node); BUG_ON(!buf_node->has_async_transaction); BUG_ON(buf_node->proc != proc); - binder_inner_proc_lock(proc); w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); if (!w) @@ -3026,7 +3090,7 @@ static int binder_thread_write(struct binder_proc *proc, else binder_enqueue_work_ilocked( w, &thread->todo); - binder_inner_proc_unlock(proc); + binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); binder_transaction_buffer_release(proc, buffer, NULL); @@ -3151,6 +3215,7 @@ static int binder_thread_write(struct binder_proc *proc, INIT_LIST_HEAD(&death->work.entry); death->cookie = cookie; ref->death = death; + binder_node_lock(ref->node); if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; if (thread->looper & @@ -3169,10 +3234,13 @@ static int binder_thread_write(struct binder_proc *proc, &proc->wait); } } + binder_node_unlock(ref->node); } else { + binder_node_lock(ref->node); if (ref->death == NULL) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); + binder_node_unlock(ref->node); break; } death = ref->death; @@ -3181,6 +3249,7 @@ static int binder_thread_write(struct binder_proc *proc, proc->pid, thread->pid, (u64)death->cookie, (u64)cookie); + binder_node_unlock(ref->node); break; } ref->death = NULL; @@ -3205,6 +3274,7 @@ static int binder_thread_write(struct binder_proc *proc, death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } binder_inner_proc_unlock(proc); + binder_node_unlock(ref->node); } } break; case BC_DEAD_BINDER_DONE: { @@ -3487,6 +3557,17 @@ retry: (u64)node_cookie); rb_erase(&node->rb_node, &proc->nodes); binder_inner_proc_unlock(proc); + binder_node_lock(node); + /* + * Acquire the node lock before freeing the + * node to serialize with other threads that + * may have been holding the node lock while + * decrementing this node (avoids race where + * this thread frees while the other thread + * is unlocking the node after the final + * decrement) + */ + binder_node_unlock(node); binder_free_node(node); } else binder_inner_proc_unlock(proc); @@ -3974,16 +4055,18 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) } else { context->binder_context_mgr_uid = curr_euid; } - new_node = binder_new_node(proc, 
0, 0); + new_node = binder_new_node(proc, NULL); if (!new_node) { ret = -ENOMEM; goto out; } + binder_node_lock(new_node); new_node->local_weak_refs++; new_node->local_strong_refs++; new_node->has_strong_ref = 1; new_node->has_weak_ref = 1; context->binder_context_mgr_node = new_node; + binder_node_unlock(new_node); binder_put_node(new_node); out: mutex_unlock(&context->context_mgr_node_lock); @@ -4246,6 +4329,7 @@ static int binder_node_release(struct binder_node *node, int refs) binder_release_work(proc, &node->async_todo); + binder_node_lock(node); binder_inner_proc_lock(proc); binder_dequeue_work_ilocked(&node->work); /* @@ -4254,6 +4338,7 @@ static int binder_node_release(struct binder_node *node, int refs) BUG_ON(!node->tmp_refs); if (hlist_empty(&node->refs) && node->tmp_refs == 1) { binder_inner_proc_unlock(proc); + binder_node_unlock(node); binder_free_node(node); return refs; @@ -4290,6 +4375,7 @@ static int binder_node_release(struct binder_node *node, int refs) binder_debug(BINDER_DEBUG_DEAD_BINDER, "node %d now dead, refs %d, death %d\n", node->debug_id, refs, death); + binder_node_unlock(node); binder_put_node(node); return refs; @@ -4533,12 +4619,15 @@ static void print_binder_thread_ilocked(struct seq_file *m, m->count = start_pos; } -static void print_binder_node(struct seq_file *m, struct binder_node *node) +static void print_binder_node_nlocked(struct seq_file *m, + struct binder_node *node) { struct binder_ref *ref; struct binder_work *w; int count; + WARN_ON(!spin_is_locked(&node->lock)); + count = 0; hlist_for_each_entry(ref, &node->refs, node_entry) count++; @@ -4565,11 +4654,13 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node) static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) { + binder_node_lock(ref->node); seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", ref->data.debug_id, ref->data.desc, ref->node->proc ? "" : "dead ", ref->node->debug_id, ref->data.strong, ref->data.weak, ref->death); + binder_node_unlock(ref->node); } static void print_binder_proc(struct seq_file *m, @@ -4592,8 +4683,10 @@ static void print_binder_proc(struct seq_file *m, for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); + binder_node_lock(node); if (print_all || node->has_async_transaction) - print_binder_node(m, node); + print_binder_node_nlocked(m, node); + binder_node_unlock(node); } if (print_all) { for (n = rb_first(&proc->refs_by_desc); @@ -4765,6 +4858,7 @@ static int binder_state_show(struct seq_file *m, void *unused) { struct binder_proc *proc; struct binder_node *node; + struct binder_node *last_node = NULL; binder_lock(__func__); @@ -4773,9 +4867,25 @@ static int binder_state_show(struct seq_file *m, void *unused) spin_lock(&binder_dead_nodes_lock); if (!hlist_empty(&binder_dead_nodes)) seq_puts(m, "dead nodes:\n"); - hlist_for_each_entry(node, &binder_dead_nodes, dead_node) - print_binder_node(m, node); + hlist_for_each_entry(node, &binder_dead_nodes, dead_node) { + /* + * take a temporary reference on the node so it + * survives and isn't removed from the list + * while we print it. 
+ */ + node->tmp_refs++; + spin_unlock(&binder_dead_nodes_lock); + if (last_node) + binder_put_node(last_node); + binder_node_lock(node); + print_binder_node_nlocked(m, node); + binder_node_unlock(node); + last_node = node; + spin_lock(&binder_dead_nodes_lock); + } spin_unlock(&binder_dead_nodes_lock); + if (last_node) + binder_put_node(last_node); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) From 46655970b9673b2b60eb67bd902743455dfdebe3 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 12 Jun 2017 12:07:26 -0700 Subject: [PATCH 0927/3220] FROMLIST: binder: protect proc->nodes with inner lock (from https://patchwork.kernel.org/patch/9817783/) When locks for binder_ref handling are added, proc->nodes will need to be modified while holding the outer lock Change-Id: I17b39e981c55130c14a62fe49900eceff6e3642b Signed-off-by: Todd Kjos --- drivers/android/binder.c | 112 +++++++++++++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 23 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2ca40f4d0bb5..a0c0bd05a885 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -313,6 +313,7 @@ struct binder_error { * @work: worklist element for node work * (protected by @proc->inner_lock) * @rb_node: element for proc->nodes tree + * (protected by @proc->inner_lock) * @dead_node: element for binder_dead_nodes list * (protected by binder_dead_nodes_lock) * @proc: binder_proc that owns this node @@ -470,6 +471,7 @@ enum binder_deferred_state { * @threads: rbtree of binder_threads in this proc * @nodes: rbtree of binder nodes associated with * this proc ordered by node->ptr + * (protected by @inner_lock) * @refs_by_desc: rbtree of refs ordered by ref->desc * @refs_by_node: rbtree of refs ordered by ref->node * @pid PID of group_leader of process @@ -856,7 +858,7 @@ static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); -static void binder_inc_node_tmpref(struct binder_node *node); +static void binder_inc_node_tmpref_ilocked(struct binder_node *node); static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) { @@ -938,12 +940,14 @@ static void binder_set_nice(long nice) binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } -static struct binder_node *binder_get_node(struct binder_proc *proc, - binder_uintptr_t ptr) +static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, + binder_uintptr_t ptr) { struct rb_node *n = proc->nodes.rb_node; struct binder_node *node; + BUG_ON(!spin_is_locked(&proc->inner_lock)); + while (n) { node = rb_entry(n, struct binder_node, rb_node); @@ -957,15 +961,28 @@ static struct binder_node *binder_get_node(struct binder_proc *proc, * to ensure node stays alive until * call to binder_put_node() */ - binder_inc_node_tmpref(node); + binder_inc_node_tmpref_ilocked(node); return node; } } return NULL; } -static struct binder_node *binder_new_node(struct binder_proc *proc, - struct flat_binder_object *fp) +static struct binder_node *binder_get_node(struct binder_proc *proc, + binder_uintptr_t ptr) +{ + struct binder_node *node; + + binder_inner_proc_lock(proc); + node = binder_get_node_ilocked(proc, ptr); + binder_inner_proc_unlock(proc); + return node; +} + +static struct binder_node *binder_init_node_ilocked( + struct binder_proc *proc, + struct binder_node *new_node, + struct 
flat_binder_object *fp) { struct rb_node **p = &proc->nodes.rb_node; struct rb_node *parent = NULL; @@ -974,7 +991,9 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, binder_uintptr_t cookie = fp ? fp->cookie : 0; __u32 flags = fp ? fp->flags : 0; + BUG_ON(!spin_is_locked(&proc->inner_lock)); while (*p) { + parent = *p; node = rb_entry(parent, struct binder_node, rb_node); @@ -982,13 +1001,17 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, p = &(*p)->rb_left; else if (ptr > node->ptr) p = &(*p)->rb_right; - else - return NULL; + else { + /* + * A matching node is already in + * the rb tree. Abandon the init + * and return it. + */ + binder_inc_node_tmpref_ilocked(node); + return node; + } } - - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (node == NULL) - return NULL; + node = new_node; binder_stats_created(BINDER_STAT_NODE); node->tmp_refs++; rb_link_node(&node->rb_node, parent, p); @@ -1007,6 +1030,27 @@ static struct binder_node *binder_new_node(struct binder_proc *proc, "%d:%d node %d u%016llx c%016llx created\n", proc->pid, current->pid, node->debug_id, (u64)node->ptr, (u64)node->cookie); + + return node; +} + +static struct binder_node *binder_new_node(struct binder_proc *proc, + struct flat_binder_object *fp) +{ + struct binder_node *node; + struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL); + + if (!new_node) + return NULL; + binder_inner_proc_lock(proc); + node = binder_init_node_ilocked(proc, new_node, fp); + binder_inner_proc_unlock(proc); + if (node != new_node) + /* + * The node was already added by another thread + */ + kfree(new_node); + return node; } @@ -4421,6 +4465,7 @@ static void binder_deferred_release(struct binder_proc *proc) nodes = 0; incoming_refs = 0; + binder_inner_proc_lock(proc); while ((n = rb_first(&proc->nodes))) { struct binder_node *node; @@ -4431,10 +4476,13 @@ static void binder_deferred_release(struct binder_proc *proc) * calling binder_node_release() which will either * kfree() the node or call binder_put_node() */ - binder_inc_node_tmpref(node); + binder_inc_node_tmpref_ilocked(node); rb_erase(&node->rb_node, &proc->nodes); + binder_inner_proc_unlock(proc); incoming_refs = binder_node_release(node, incoming_refs); + binder_inner_proc_lock(proc); } + binder_inner_proc_unlock(proc); outgoing_refs = 0; while ((n = rb_first(&proc->refs_by_desc))) { @@ -4619,14 +4667,16 @@ static void print_binder_thread_ilocked(struct seq_file *m, m->count = start_pos; } -static void print_binder_node_nlocked(struct seq_file *m, - struct binder_node *node) +static void print_binder_node_nilocked(struct seq_file *m, + struct binder_node *node) { struct binder_ref *ref; struct binder_work *w; int count; WARN_ON(!spin_is_locked(&node->lock)); + if (node->proc) + WARN_ON(!spin_is_locked(&node->proc->inner_lock)); count = 0; hlist_for_each_entry(ref, &node->refs, node_entry) @@ -4644,11 +4694,9 @@ static void print_binder_node_nlocked(struct seq_file *m, } seq_puts(m, "\n"); if (node->proc) { - binder_inner_proc_lock(node->proc); list_for_each_entry(w, &node->async_todo, entry) print_binder_work_ilocked(m, " ", " pending async transaction", w); - binder_inner_proc_unlock(node->proc); } } @@ -4670,6 +4718,7 @@ static void print_binder_proc(struct seq_file *m, struct rb_node *n; size_t start_pos = m->count; size_t header_pos; + struct binder_node *last_node = NULL; seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); @@ -4679,15 +4728,30 @@ static void 
print_binder_proc(struct seq_file *m, for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread, rb_node), print_all); - binder_inner_proc_unlock(proc); + for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); - binder_node_lock(node); - if (print_all || node->has_async_transaction) - print_binder_node_nlocked(m, node); - binder_node_unlock(node); + /* + * take a temporary reference on the node so it + * survives and isn't removed from the tree + * while we print it. + */ + binder_inc_node_tmpref_ilocked(node); + /* Need to drop inner lock to take node lock */ + binder_inner_proc_unlock(proc); + if (last_node) + binder_put_node(last_node); + binder_node_inner_lock(node); + print_binder_node_nilocked(m, node); + binder_node_inner_unlock(node); + last_node = node; + binder_inner_proc_lock(proc); } + binder_inner_proc_unlock(proc); + if (last_node) + binder_put_node(last_node); + if (print_all) { for (n = rb_first(&proc->refs_by_desc); n != NULL; @@ -4823,8 +4887,10 @@ static void print_binder_proc_stats(struct seq_file *m, proc->ready_threads, binder_alloc_get_free_async_space(&proc->alloc)); count = 0; + binder_inner_proc_lock(proc); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; + binder_inner_proc_unlock(proc); seq_printf(m, " nodes: %d\n", count); count = 0; strong = 0; weak = 0; @@ -4878,7 +4944,7 @@ static int binder_state_show(struct seq_file *m, void *unused) if (last_node) binder_put_node(last_node); binder_node_lock(node); - print_binder_node_nlocked(m, node); + print_binder_node_nilocked(m, node); binder_node_unlock(node); last_node = node; spin_lock(&binder_dead_nodes_lock); From e495123304a5949848881496ee23990791a57db3 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 25 May 2017 15:52:17 -0700 Subject: [PATCH 0928/3220] FROMLIST: binder: protect proc->threads with inner_lock (from https://patchwork.kernel.org/patch/9817775/) proc->threads will need to be accessed while locks of other processes are held, so use proc->inner_lock to protect it. proc->tmp_ref now needs to be protected by proc->inner_lock.
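As a rough illustration of the pattern this change adopts (a minimal userspace sketch with simplified, hypothetical types, not the driver code), the temporary-reference release must perform the decrement, the is_dead test, and the free decision under the same lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct proc_obj {
	pthread_spinlock_t inner_lock;	/* stands in for proc->inner_lock */
	int tmp_ref;			/* stands in for proc->tmp_ref */
	bool is_dead;			/* set when teardown begins */
};

static void proc_dec_tmpref(struct proc_obj *p)
{
	pthread_spin_lock(&p->inner_lock);
	p->tmp_ref--;
	if (p->is_dead && p->tmp_ref == 0) {
		/* last reference to a dead proc: drop the lock, then free */
		pthread_spin_unlock(&p->inner_lock);
		free(p);
		return;
	}
	pthread_spin_unlock(&p->inner_lock);
}

Without the lock held across both the decrement and the is_dead check, two racing threads could each observe a stale count and either leak the structure or free it while still in use.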
Change-Id: I176cfeca16bf7c9b34b428c16405f93db81d2ff8 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 87 +++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a0c0bd05a885..8735f9a6c018 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -469,6 +469,7 @@ enum binder_deferred_state { * struct binder_proc - binder process bookkeeping * @proc_node: element for binder_procs list * @threads: rbtree of binder_threads in this proc + * (protected by @inner_lock) * @nodes: rbtree of binder nodes associated with * this proc ordered by node->ptr * (protected by @inner_lock) @@ -486,6 +487,7 @@ enum binder_deferred_state { * (protected by binder_deferred_lock) * @is_dead: process is dead and awaiting free * when outstanding transactions are cleaned up + * (protected by @inner_lock) * @todo: list of work for this process * (protected by @inner_lock) * @wait: wait queue head to wait for proc work @@ -501,6 +503,7 @@ enum binder_deferred_state { * @requested_threads_started: number binder threads started * @ready_threads: number of threads waiting for proc work * @tmp_ref: temporary reference to indicate proc is in use + * (protected by @inner_lock) * @default_priority: default scheduler priority * (invariant after initialized) * @debugfs_entry: debugfs node @@ -556,6 +559,7 @@ enum { * @proc: binder process for this thread * (invariant after initialization) * @rb_node: element for proc->threads rbtree + * (protected by @proc->inner_lock) * @pid: PID for this thread * (invariant after initialization) * @looper: bitmap of looping state @@ -576,6 +580,7 @@ enum { * always be acquired) * @is_dead: thread is dead and awaiting free * when outstanding transactions are cleaned up + * (protected by @proc->inner_lock) * * Bookkeeping structure for binder threads. 
*/ @@ -1668,15 +1673,15 @@ static void binder_thread_dec_tmpref(struct binder_thread *thread) /* * atomic is used to protect the counter value while * it cannot reach zero or thread->is_dead is false - * - * TODO: future patch adds locking to ensure that the - * check of tmp_ref and is_dead is done with a lock held */ + binder_inner_proc_lock(thread->proc); atomic_dec(&thread->tmp_ref); if (thread->is_dead && !atomic_read(&thread->tmp_ref)) { + binder_inner_proc_unlock(thread->proc); binder_free_thread(thread); return; } + binder_inner_proc_unlock(thread->proc); } /** @@ -1693,12 +1698,15 @@ static void binder_thread_dec_tmpref(struct binder_thread *thread) */ static void binder_proc_dec_tmpref(struct binder_proc *proc) { + binder_inner_proc_lock(proc); proc->tmp_ref--; if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && !proc->tmp_ref) { + binder_inner_proc_unlock(proc); binder_free_proc(proc); return; } + binder_inner_proc_unlock(proc); } /** @@ -2481,7 +2489,9 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_dead_binder; } + binder_inner_proc_lock(target_proc); target_proc->tmp_ref++; + binder_inner_proc_unlock(target_proc); binder_node_unlock(target_node); if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { @@ -3855,7 +3865,8 @@ static void binder_release_work(struct binder_proc *proc, } -static struct binder_thread *binder_get_thread(struct binder_proc *proc) +static struct binder_thread *binder_get_thread_ilocked( + struct binder_proc *proc, struct binder_thread *new_thread) { struct binder_thread *thread = NULL; struct rb_node *parent = NULL; @@ -3870,25 +3881,45 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc) else if (current->pid > thread->pid) p = &(*p)->rb_right; else - break; + return thread; } - if (*p == NULL) { - thread = kzalloc(sizeof(*thread), GFP_KERNEL); - if (thread == NULL) + if (!new_thread) + return NULL; + thread = new_thread; + binder_stats_created(BINDER_STAT_THREAD); + thread->proc = proc; + thread->pid = current->pid; + atomic_set(&thread->tmp_ref, 0); + init_waitqueue_head(&thread->wait); + INIT_LIST_HEAD(&thread->todo); + rb_link_node(&thread->rb_node, parent, p); + rb_insert_color(&thread->rb_node, &proc->threads); + thread->looper_need_return = true; + thread->return_error.work.type = BINDER_WORK_RETURN_ERROR; + thread->return_error.cmd = BR_OK; + thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; + thread->reply_error.cmd = BR_OK; + + return thread; +} + +static struct binder_thread *binder_get_thread(struct binder_proc *proc) +{ + struct binder_thread *thread; + struct binder_thread *new_thread; + + binder_inner_proc_lock(proc); + thread = binder_get_thread_ilocked(proc, NULL); + binder_inner_proc_unlock(proc); + if (!thread) { + new_thread = kzalloc(sizeof(*thread), GFP_KERNEL); + if (new_thread == NULL) return NULL; - binder_stats_created(BINDER_STAT_THREAD); - thread->proc = proc; - thread->pid = current->pid; - atomic_set(&thread->tmp_ref, 0); - init_waitqueue_head(&thread->wait); - INIT_LIST_HEAD(&thread->todo); - rb_link_node(&thread->rb_node, parent, p); - rb_insert_color(&thread->rb_node, &proc->threads); - thread->looper_need_return = true; - thread->return_error.work.type = BINDER_WORK_RETURN_ERROR; - thread->return_error.cmd = BR_OK; - thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; - thread->reply_error.cmd = BR_OK; + binder_inner_proc_lock(proc); + thread = binder_get_thread_ilocked(proc, new_thread); + 
binder_inner_proc_unlock(proc); + if (thread != new_thread) + kfree(new_thread); } return thread; } @@ -3919,6 +3950,7 @@ static int binder_thread_release(struct binder_proc *proc, int active_transactions = 0; struct binder_transaction *last_t = NULL; + binder_inner_proc_lock(thread->proc); /* * take a ref on the proc so it survives * after we remove this thread from proc->threads. @@ -3966,6 +3998,7 @@ static int binder_thread_release(struct binder_proc *proc, if (t) spin_lock(&t->lock); } + binder_inner_proc_unlock(thread->proc); if (send_reply) binder_send_failed_reply(send_reply, BR_DEAD_REPLY); @@ -4339,6 +4372,7 @@ static void binder_deferred_flush(struct binder_proc *proc) struct rb_node *n; int wake_count = 0; + binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node); @@ -4348,6 +4382,7 @@ static void binder_deferred_flush(struct binder_proc *proc) wake_count++; } } + binder_inner_proc_unlock(proc); wake_up_interruptible_all(&proc->wait); binder_debug(BINDER_DEBUG_OPEN_CLOSE, @@ -4446,6 +4481,7 @@ static void binder_deferred_release(struct binder_proc *proc) context->binder_context_mgr_node = NULL; } mutex_unlock(&context->context_mgr_node_lock); + binder_inner_proc_lock(proc); /* * Make sure proc stays alive after we * remove all the threads @@ -4459,13 +4495,14 @@ static void binder_deferred_release(struct binder_proc *proc) struct binder_thread *thread; thread = rb_entry(n, struct binder_thread, rb_node); + binder_inner_proc_unlock(proc); threads++; active_transactions += binder_thread_release(proc, thread); + binder_inner_proc_lock(proc); } nodes = 0; incoming_refs = 0; - binder_inner_proc_lock(proc); while ((n = rb_first(&proc->nodes))) { struct binder_node *node; @@ -4873,10 +4910,13 @@ static void print_binder_proc_stats(struct seq_file *m, struct binder_work *w; struct rb_node *n; int count, strong, weak; + size_t free_async_space = + binder_alloc_get_free_async_space(&proc->alloc); seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); count = 0; + binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; seq_printf(m, " threads: %d\n", count); @@ -4885,9 +4925,8 @@ static void print_binder_proc_stats(struct seq_file *m, " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, proc->ready_threads, - binder_alloc_get_free_async_space(&proc->alloc)); + free_async_space); count = 0; - binder_inner_proc_lock(proc); for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; binder_inner_proc_unlock(proc); From 89b657e0d7de20459820d7edf341bc888b69aa83 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 2 Jun 2017 13:36:52 -0700 Subject: [PATCH 0929/3220] FROMLIST: binder: protect transaction_stack with inner lock. (from https://patchwork.kernel.org/patch/9817779/) This makes future changes to priority inheritance easier, since we want to be able to look at a thread's transaction stack when selecting a thread to inherit priority for. It also allows us to take just a single lock in a few paths, where we used to take two in succession. 
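A sketch of the lookup pattern this enables (illustrative userspace code with hypothetical names; the driver uses atomics and its own helpers): a t->from pointer sampled under only t->lock is a hint that must be re-validated once the owner's inner lock is held, after pinning the thread with a temporary reference.

#include <pthread.h>
#include <stddef.h>

struct thread_obj {
	pthread_spinlock_t inner_lock;	/* stands in for proc->inner_lock */
	int tmp_ref;			/* keeps the struct alive (atomic in the driver) */
};

struct txn_obj {
	pthread_spinlock_t lock;	/* stands in for t->lock */
	struct thread_obj *from;	/* cleared when the sender is released */
};

static struct thread_obj *txn_get_from_and_lock(struct txn_obj *t)
{
	struct thread_obj *from;

	/* pin the thread under t->lock so it cannot be freed under us */
	pthread_spin_lock(&t->lock);
	from = t->from;
	if (from)
		from->tmp_ref++;
	pthread_spin_unlock(&t->lock);
	if (!from)
		return NULL;

	/* re-check t->from with the inner lock held; a concurrent
	 * release may have cleared it after we sampled it */
	pthread_spin_lock(&from->inner_lock);
	if (t->from == from)
		return from;	/* returned locked; caller unlocks and unpins */
	pthread_spin_unlock(&from->inner_lock);
	from->tmp_ref--;	/* lost the race with teardown */
	return NULL;
}

The same shape appears in binder_get_txn_from_and_acq_inner() below: reference first, then lock, then re-validate.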
Change-Id: Idb1b6e9faa5c669978b2b3011fe326be8aece586 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 96 +++++++++++++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 17 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 8735f9a6c018..328ec2cd7956 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -30,7 +30,8 @@ * 3) proc->inner_lock : protects the thread and node lists * (proc->threads, proc->nodes) and all todo lists associated * with the binder_proc (proc->todo, thread->todo, - * proc->delivered_death and node->async_todo). + * proc->delivered_death and node->async_todo), as well as + * thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * @@ -567,11 +568,13 @@ enum { * @looper_need_return: looping thread needs to exit driver * (no lock needed) * @transaction_stack: stack of in-progress transactions for this thread + * (protected by @proc->inner_lock) * @todo: list of work to do for this thread + * (protected by @proc->inner_lock) * @return_error: transaction errors reported by this thread * (only accessed by this thread) * @reply_error: transaction errors reported by target thread + * (protected by @proc->inner_lock) * @wait: wait queue for thread work * @stats: per-thread statistics * (atomics, no lock needed) @@ -1645,10 +1648,11 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, return ret; } -static void binder_pop_transaction(struct binder_thread *target_thread, - struct binder_transaction *t) +static void binder_pop_transaction_ilocked(struct binder_thread *target_thread, + struct binder_transaction *t) { BUG_ON(!target_thread); + BUG_ON(!spin_is_locked(&target_thread->proc->inner_lock)); BUG_ON(target_thread->transaction_stack != t); BUG_ON(target_thread->transaction_stack->from != target_thread); target_thread->transaction_stack = @@ -1732,6 +1736,35 @@ static struct binder_thread *binder_get_txn_from( return from; } +/** + * binder_get_txn_from_and_acq_inner() - get t->from and acquire inner lock + * @t: binder transaction for t->from + * + * Same as binder_get_txn_from() except it also acquires the proc->inner_lock + * to guarantee that the thread cannot be released while operating on it. + * The caller must call binder_inner_proc_unlock() to release the inner lock + * as well as call binder_thread_dec_tmpref() to release the reference.
+ * + * Return: the value of t->from + */ +static struct binder_thread *binder_get_txn_from_and_acq_inner( + struct binder_transaction *t) +{ + struct binder_thread *from; + + from = binder_get_txn_from(t); + if (!from) + return NULL; + binder_inner_proc_lock(from->proc); + if (t->from) { + BUG_ON(from != t->from); + return from; + } + binder_inner_proc_unlock(from->proc); + binder_thread_dec_tmpref(from); + return NULL; +} + static void binder_free_transaction(struct binder_transaction *t) { if (t->buffer) @@ -1748,7 +1781,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, BUG_ON(t->flags & TF_ONE_WAY); while (1) { - target_thread = binder_get_txn_from(t); + target_thread = binder_get_txn_from_and_acq_inner(t); if (target_thread) { binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d to %d:%d\n", @@ -1756,11 +1789,10 @@ static void binder_send_failed_reply(struct binder_transaction *t, target_thread->proc->pid, target_thread->pid); - binder_pop_transaction(target_thread, t); + binder_pop_transaction_ilocked(target_thread, t); if (target_thread->reply_error.cmd == BR_OK) { target_thread->reply_error.cmd = error_code; - binder_enqueue_work( - target_thread->proc, + binder_enqueue_work_ilocked( &target_thread->reply_error.work, &target_thread->todo); wake_up_interruptible(&target_thread->wait); @@ -1768,6 +1800,7 @@ static void binder_send_failed_reply(struct binder_transaction *t, WARN(1, "Unexpected reply error: %u\n", target_thread->reply_error.cmd); } + binder_inner_proc_unlock(target_thread->proc); binder_thread_dec_tmpref(target_thread); binder_free_transaction(t); return; @@ -2397,8 +2430,10 @@ static void binder_transaction(struct binder_proc *proc, e->context_name = proc->context->name; if (reply) { + binder_inner_proc_lock(proc); in_reply_to = thread->transaction_stack; if (in_reply_to == NULL) { + binder_inner_proc_unlock(proc); binder_user_error("%d:%d got reply transaction with no transaction stack\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; @@ -2406,7 +2441,6 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_empty_call_stack; } - binder_set_nice(in_reply_to->saved_priority); if (in_reply_to->to_thread != thread) { spin_lock(&in_reply_to->lock); binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", @@ -2416,6 +2450,7 @@ static void binder_transaction(struct binder_proc *proc, in_reply_to->to_thread ? in_reply_to->to_thread->pid : 0); spin_unlock(&in_reply_to->lock); + binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; @@ -2423,7 +2458,9 @@ static void binder_transaction(struct binder_proc *proc, goto err_bad_call_stack; } thread->transaction_stack = in_reply_to->to_parent; - target_thread = binder_get_txn_from(in_reply_to); + binder_inner_proc_unlock(proc); + binder_set_nice(in_reply_to->saved_priority); + target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { return_error = BR_DEAD_REPLY; return_error_line = __LINE__; @@ -2435,6 +2472,7 @@ static void binder_transaction(struct binder_proc *proc, target_thread->transaction_stack ? 
target_thread->transaction_stack->debug_id : 0, in_reply_to->debug_id); + binder_inner_proc_unlock(target_thread->proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; @@ -2444,6 +2482,7 @@ static void binder_transaction(struct binder_proc *proc, } target_proc = target_thread->proc; target_proc->tmp_ref++; + binder_inner_proc_unlock(target_thread->proc); } else { if (tr->target.handle) { struct binder_ref *ref; @@ -2500,6 +2539,7 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_invalid_target_handle; } + binder_inner_proc_lock(proc); if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; @@ -2512,6 +2552,7 @@ static void binder_transaction(struct binder_proc *proc, tmp->to_thread ? tmp->to_thread->pid : 0); spin_unlock(&tmp->lock); + binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; @@ -2532,6 +2573,7 @@ static void binder_transaction(struct binder_proc *proc, tmp = tmp->from_parent; } } + binder_inner_proc_unlock(proc); } if (target_thread) { e->to_thread = target_thread->pid; @@ -2812,23 +2854,34 @@ static void binder_transaction(struct binder_proc *proc, t->work.type = BINDER_WORK_TRANSACTION; if (reply) { - if (target_thread->is_dead) + binder_inner_proc_lock(target_proc); + if (target_thread->is_dead) { + binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; + } BUG_ON(t->buffer->async_transaction != 0); - binder_pop_transaction(target_thread, in_reply_to); + binder_pop_transaction_ilocked(target_thread, in_reply_to); + binder_enqueue_work_ilocked(&t->work, target_list); + binder_inner_proc_unlock(target_proc); binder_free_transaction(in_reply_to); - binder_enqueue_work(target_proc, &t->work, target_list); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); + binder_inner_proc_lock(proc); t->need_reply = 1; t->from_parent = thread->transaction_stack; thread->transaction_stack = t; + binder_inner_proc_unlock(proc); + binder_inner_proc_lock(target_proc); if (target_proc->is_dead || (target_thread && target_thread->is_dead)) { - binder_pop_transaction(thread, t); + binder_inner_proc_unlock(target_proc); + binder_inner_proc_lock(proc); + binder_pop_transaction_ilocked(thread, t); + binder_inner_proc_unlock(proc); goto err_dead_proc_or_thread; } - binder_enqueue_work(target_proc, &t->work, target_list); + binder_enqueue_work_ilocked(&t->work, target_list); + binder_inner_proc_unlock(target_proc); } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); @@ -2843,12 +2896,15 @@ static void binder_transaction(struct binder_proc *proc, * must be atomic with enqueue on * async_todo */ + binder_inner_proc_lock(target_proc); if (target_proc->is_dead || (target_thread && target_thread->is_dead)) { + binder_inner_proc_unlock(target_proc); binder_node_unlock(target_node); goto err_dead_proc_or_thread; } - binder_enqueue_work(target_proc, &t->work, target_list); + binder_enqueue_work_ilocked(&t->work, target_list); + binder_inner_proc_unlock(target_proc); binder_node_unlock(target_node); } if (target_wait) { @@ -3465,8 +3521,10 @@ static int binder_thread_read(struct binder_proc *proc, } retry: + binder_inner_proc_lock(proc); wait_for_proc_work = thread->transaction_stack == NULL && - binder_worklist_empty(proc, &thread->todo); + binder_worklist_empty_ilocked(&thread->todo); + binder_inner_proc_unlock(proc); thread->looper |= 
BINDER_LOOPER_STATE_WAITING; if (wait_for_proc_work) @@ -3778,9 +3836,11 @@ retry: binder_thread_dec_tmpref(t_from); t->buffer->allow_user_free = 1; if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { + binder_inner_proc_lock(thread->proc); t->to_parent = thread->transaction_stack; t->to_thread = thread; thread->transaction_stack = t; + binder_inner_proc_unlock(thread->proc); } else { binder_free_transaction(t); } @@ -4018,8 +4078,10 @@ static unsigned int binder_poll(struct file *filp, thread = binder_get_thread(proc); + binder_inner_proc_lock(thread->proc); wait_for_proc_work = thread->transaction_stack == NULL && - binder_worklist_empty(proc, &thread->todo); + binder_worklist_empty_ilocked(&thread->todo); + binder_inner_proc_unlock(thread->proc); binder_unlock(__func__); From 814ce251cb48216fd254fb78db26fba9e363b953 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 25 May 2017 17:35:02 -0700 Subject: [PATCH 0930/3220] FROMLIST: binder: use inner lock to protect thread accounting (from https://patchwork.kernel.org/patch/9817763/) Use the inner lock to protect thread accounting fields in proc structure: max_threads, requested_threads, requested_threads_started and ready_threads. Change-Id: I5a17eb68812702f803d4e2806e7887de0b3af18e Signed-off-by: Todd Kjos --- drivers/android/binder.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 328ec2cd7956..fdafd7df9171 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -498,11 +498,15 @@ enum binder_deferred_state { * @delivered_death: list of delivered death notification * (protected by @inner_lock) * @max_threads: cap on number of binder threads + * (protected by @inner_lock) * @requested_threads: number of binder threads requested but not * yet started. In current implementation, can * only be 0 or 1. 
+ * (protected by @inner_lock) * @requested_threads_started: number binder threads started + * (protected by @inner_lock) * @ready_threads: number of threads waiting for proc work + * (protected by @inner_lock) * @tmp_ref: temporary reference to indicate proc is in use * (protected by @inner_lock) * @default_priority: default scheduler priority @@ -3235,6 +3239,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_REGISTER_LOOPER\n", proc->pid, thread->pid); + binder_inner_proc_lock(proc); if (thread->looper & BINDER_LOOPER_STATE_ENTERED) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n", @@ -3248,6 +3253,7 @@ proc->requested_threads_started++; } thread->looper |= BINDER_LOOPER_STATE_REGISTERED; + binder_inner_proc_unlock(proc); break; case BC_ENTER_LOOPER: binder_debug(BINDER_DEBUG_THREADS, @@ -3524,11 +3530,11 @@ retry: binder_inner_proc_lock(proc); wait_for_proc_work = thread->transaction_stack == NULL && binder_worklist_empty_ilocked(&thread->todo); + if (wait_for_proc_work) + proc->ready_threads++; binder_inner_proc_unlock(proc); thread->looper |= BINDER_LOOPER_STATE_WAITING; - if (wait_for_proc_work) - proc->ready_threads++; binder_unlock(__func__); @@ -3559,8 +3565,10 @@ retry: binder_lock(__func__); + binder_inner_proc_lock(proc); if (wait_for_proc_work) proc->ready_threads--; + binder_inner_proc_unlock(proc); thread->looper &= ~BINDER_LOOPER_STATE_WAITING; if (ret) @@ -3850,19 +3858,22 @@ retry: done: *consumed = ptr - buffer; + binder_inner_proc_lock(proc); if (proc->requested_threads + proc->ready_threads == 0 && proc->requested_threads_started < proc->max_threads && (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ /*spawn a new thread if we leave this out */) { proc->requested_threads++; + binder_inner_proc_unlock(proc); binder_debug(BINDER_DEBUG_THREADS, "%d:%d BR_SPAWN_LOOPER\n", proc->pid, thread->pid); if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer)) return -EFAULT; binder_stat_br(proc, thread, BR_SPAWN_LOOPER); - } + } else + binder_inner_proc_unlock(proc); return 0; } @@ -4242,12 +4253,19 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) goto err; break; - case BINDER_SET_MAX_THREADS: - if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) { + case BINDER_SET_MAX_THREADS: { + int max_threads; + + if (copy_from_user(&max_threads, ubuf, + sizeof(max_threads))) { ret = -EINVAL; goto err; } + binder_inner_proc_lock(proc); + proc->max_threads = max_threads; + binder_inner_proc_unlock(proc); break; + } case BINDER_SET_CONTEXT_MGR: ret = binder_ioctl_set_ctx_mgr(filp); if (ret) From 6fcb2b9ac4aed1946358073c3a2802aa8bd57c3d Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 20 Oct 2016 16:43:34 -0700 Subject: [PATCH 0931/3220] FROMLIST: binder: protect binder_ref with outer lock (from https://patchwork.kernel.org/patch/9817771/) Use proc->outer_lock to protect the binder_ref structure. The outer lock allows functions operating on the binder_ref to do nested acquires of node and inner locks as necessary to attach refs to nodes atomically. Binder refs must never be accessed without holding the outer lock.
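One consequence of the outer lock being a spinlock is visible in binder_inc_ref_for_node(): the ref must be allocated with the lock dropped and the lookup repeated afterwards. A minimal, self-contained userspace sketch of that pattern (hypothetical names; a linked list stands in for the refs rb-trees):

#include <pthread.h>
#include <stdlib.h>

struct ref {
	int node_id;
	struct ref *next;
};

struct proc_obj {
	pthread_mutex_t outer_lock;	/* stands in for proc->outer_lock */
	struct ref *refs;		/* stands in for refs_by_desc/refs_by_node */
};

/* caller must hold p->outer_lock */
static struct ref *lookup_ref_locked(struct proc_obj *p, int node_id)
{
	struct ref *r;

	for (r = p->refs; r; r = r->next)
		if (r->node_id == node_id)
			return r;
	return NULL;
}

static struct ref *get_or_create_ref(struct proc_obj *p, int node_id)
{
	struct ref *r, *new_ref = NULL;

	pthread_mutex_lock(&p->outer_lock);
	r = lookup_ref_locked(p, node_id);
	if (!r) {
		/* in the driver the allocation may sleep, so it must
		 * happen with the spinlock dropped */
		pthread_mutex_unlock(&p->outer_lock);
		new_ref = calloc(1, sizeof(*new_ref));
		if (!new_ref)
			return NULL;
		pthread_mutex_lock(&p->outer_lock);
		/* look up again: another thread may have raced us here */
		r = lookup_ref_locked(p, node_id);
		if (!r) {
			new_ref->node_id = node_id;
			new_ref->next = p->refs;
			p->refs = new_ref;
			r = new_ref;
		}
	}
	pthread_mutex_unlock(&p->outer_lock);
	if (new_ref && r != new_ref)
		free(new_ref);	/* lost the insertion race */
	return r;
}

The re-lookup makes the race benign: whichever thread inserts first wins, and the loser frees its unused allocation, exactly as binder_inc_ref_for_node() kfree()s a redundant new_ref.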
Change-Id: Icf6add0eddf70473b39239960b2d9a524775b53a Signed-off-by: Todd Kjos --- drivers/android/binder.c | 133 ++++++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 50 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index fdafd7df9171..2ef3022d969d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -475,7 +475,9 @@ enum binder_deferred_state { * this proc ordered by node->ptr * (protected by @inner_lock) * @refs_by_desc: rbtree of refs ordered by ref->desc + * (protected by @outer_lock) * @refs_by_node: rbtree of refs ordered by ref->node + * (protected by @outer_lock) * @pid PID of group_leader of process * (invariant after initialized) * @tsk task_struct for group_leader of process @@ -1270,8 +1272,8 @@ static void binder_put_node(struct binder_node *node) binder_dec_node_tmpref(node); } -static struct binder_ref *binder_get_ref(struct binder_proc *proc, - u32 desc, bool need_strong_ref) +static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc, + u32 desc, bool need_strong_ref) { struct rb_node *n = proc->refs_by_desc.rb_node; struct binder_ref *ref; @@ -1294,7 +1296,7 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, } /** - * binder_get_ref_for_node() - get the ref associated with given node + * binder_get_ref_for_node_olocked() - get the ref associated with given node * @proc: binder_proc that owns the ref * @node: binder_node of target * @new_ref: newly allocated binder_ref to be initialized or %NULL @@ -1311,9 +1313,10 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc, * new_ref. new_ref must be kfree'd by the caller in * this case. */ -static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, - struct binder_node *node, - struct binder_ref *new_ref) +static struct binder_ref *binder_get_ref_for_node_olocked( + struct binder_proc *proc, + struct binder_node *node, + struct binder_ref *new_ref) { struct binder_context *context = proc->context; struct rb_node **p = &proc->refs_by_node.rb_node; @@ -1376,7 +1379,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, return new_ref; } -static void binder_cleanup_ref(struct binder_ref *ref) +static void binder_cleanup_ref_olocked(struct binder_ref *ref) { bool delete_node = false; @@ -1419,17 +1422,17 @@ static void binder_cleanup_ref(struct binder_ref *ref) } /** - * binder_inc_ref() - increment the ref for given handle + * binder_inc_ref_olocked() - increment the ref for given handle * @ref: ref to be incremented * @strong: if true, strong increment, else weak * @target_list: list to queue node work on * - * Increment the ref. + * Increment the ref. @ref->proc->outer_lock must be held on entry * * Return: 0, if successful, else errno */ -static int binder_inc_ref(struct binder_ref *ref, int strong, - struct list_head *target_list) +static int binder_inc_ref_olocked(struct binder_ref *ref, int strong, + struct list_head *target_list) { int ret; @@ -1458,12 +1461,9 @@ static int binder_inc_ref(struct binder_ref *ref, int strong, * * Decrement the ref. * - * TODO: kfree is avoided here since an upcoming patch - * will put this under a lock. 
- * * Return: true if ref is cleaned up and ready to be freed */ -static bool binder_dec_ref(struct binder_ref *ref, int strong) +static bool binder_dec_ref_olocked(struct binder_ref *ref, int strong) { if (strong) { if (ref->data.strong == 0) { @@ -1487,13 +1487,7 @@ static bool binder_dec_ref(struct binder_ref *ref, int strong) ref->data.weak--; } if (ref->data.strong == 0 && ref->data.weak == 0) { - binder_cleanup_ref(ref); - /* - * TODO: we could kfree(ref) here, but an upcoming - * patch will call this with a lock held, so we - * return an indication that the ref should be - * freed. - */ + binder_cleanup_ref_olocked(ref); return true; } return false; @@ -1518,7 +1512,8 @@ static struct binder_node *binder_get_node_from_ref( struct binder_node *node; struct binder_ref *ref; - ref = binder_get_ref(proc, desc, need_strong_ref); + binder_proc_lock(proc); + ref = binder_get_ref_olocked(proc, desc, need_strong_ref); if (!ref) goto err_no_ref; node = ref->node; @@ -1529,10 +1524,12 @@ static struct binder_node *binder_get_node_from_ref( binder_inc_node_tmpref(node); if (rdata) *rdata = ref->data; + binder_proc_unlock(proc); return node; err_no_ref: + binder_proc_unlock(proc); return NULL; } @@ -1572,24 +1569,27 @@ static int binder_update_ref_for_handle(struct binder_proc *proc, struct binder_ref *ref; bool delete_ref = false; - ref = binder_get_ref(proc, desc, strong); + binder_proc_lock(proc); + ref = binder_get_ref_olocked(proc, desc, strong); if (!ref) { ret = -EINVAL; goto err_no_ref; } if (increment) - ret = binder_inc_ref(ref, strong, NULL); + ret = binder_inc_ref_olocked(ref, strong, NULL); else - delete_ref = binder_dec_ref(ref, strong); + delete_ref = binder_dec_ref_olocked(ref, strong); if (rdata) *rdata = ref->data; + binder_proc_unlock(proc); if (delete_ref) binder_free_ref(ref); return ret; err_no_ref: + binder_proc_unlock(proc); return ret; } @@ -1634,15 +1634,19 @@ static int binder_inc_ref_for_node(struct binder_proc *proc, struct binder_ref *new_ref = NULL; int ret = 0; - ref = binder_get_ref_for_node(proc, node, NULL); + binder_proc_lock(proc); + ref = binder_get_ref_for_node_olocked(proc, node, NULL); if (!ref) { + binder_proc_unlock(proc); new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!new_ref) return -ENOMEM; - ref = binder_get_ref_for_node(proc, node, new_ref); + binder_proc_lock(proc); + ref = binder_get_ref_for_node_olocked(proc, node, new_ref); } - ret = binder_inc_ref(ref, strong, target_list); + ret = binder_inc_ref_olocked(ref, strong, target_list); *rdata = ref->data; + binder_proc_unlock(proc); if (new_ref && ref != new_ref) /* * Another thread created the ref first so @@ -2498,11 +2502,14 @@ static void binder_transaction(struct binder_proc *proc, * stays alive until the transaction is * done. 
*/ - ref = binder_get_ref(proc, tr->target.handle, true); + binder_proc_lock(proc); + ref = binder_get_ref_olocked(proc, tr->target.handle, + true); if (ref) { binder_inc_node(ref->node, 1, 0, NULL); target_node = ref->node; } + binder_proc_unlock(proc); if (target_node == NULL) { binder_user_error("%d:%d got transaction to invalid handle\n", proc->pid, thread->pid); @@ -3278,7 +3285,7 @@ static int binder_thread_write(struct binder_proc *proc, uint32_t target; binder_uintptr_t cookie; struct binder_ref *ref; - struct binder_ref_death *death; + struct binder_ref_death *death = NULL; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; @@ -3286,7 +3293,29 @@ static int binder_thread_write(struct binder_proc *proc, if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); - ref = binder_get_ref(proc, target, false); + if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { + /* + * Allocate memory for death notification + * before taking lock + */ + death = kzalloc(sizeof(*death), GFP_KERNEL); + if (death == NULL) { + WARN_ON(thread->return_error.cmd != + BR_OK); + thread->return_error.cmd = BR_ERROR; + binder_enqueue_work( + thread->proc, + &thread->return_error.work, + &thread->todo); + binder_debug( + BINDER_DEBUG_FAILED_TRANSACTION, + "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", + proc->pid, thread->pid); + break; + } + } + binder_proc_lock(proc); + ref = binder_get_ref_olocked(proc, target, false); if (ref == NULL) { binder_user_error("%d:%d %s invalid ref %d\n", proc->pid, thread->pid, @@ -3294,6 +3323,8 @@ static int binder_thread_write(struct binder_proc *proc, "BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", target); + binder_proc_unlock(proc); + kfree(death); break; } @@ -3311,20 +3342,8 @@ static int binder_thread_write(struct binder_proc *proc, if (ref->death) { binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", proc->pid, thread->pid); - break; - } - death = kzalloc(sizeof(*death), GFP_KERNEL); - if (death == NULL) { - WARN_ON(thread->return_error.cmd != - BR_OK); - thread->return_error.cmd = BR_ERROR; - binder_enqueue_work( - thread->proc, - &thread->return_error.work, - &thread->todo); - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", - proc->pid, thread->pid); + binder_proc_unlock(proc); + kfree(death); break; } binder_stats_created(BINDER_STAT_DEATH); @@ -3357,6 +3376,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); binder_node_unlock(ref->node); + binder_proc_unlock(proc); break; } death = ref->death; @@ -3366,6 +3386,7 @@ static int binder_thread_write(struct binder_proc *proc, (u64)death->cookie, (u64)cookie); binder_node_unlock(ref->node); + binder_proc_unlock(proc); break; } ref->death = NULL; @@ -3392,6 +3413,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_inner_proc_unlock(proc); binder_node_unlock(ref->node); } + binder_proc_unlock(proc); } break; case BC_DEAD_BINDER_DONE: { struct binder_work *w; @@ -4602,14 +4624,18 @@ static void binder_deferred_release(struct binder_proc *proc) binder_inner_proc_unlock(proc); outgoing_refs = 0; + binder_proc_lock(proc); while ((n = rb_first(&proc->refs_by_desc))) { struct binder_ref *ref; ref = rb_entry(n, struct binder_ref, rb_node_desc); outgoing_refs++; - binder_cleanup_ref(ref); + binder_cleanup_ref_olocked(ref); + 
binder_proc_unlock(proc); binder_free_ref(ref); binder_proc_lock(proc); } + binder_proc_unlock(proc); binder_release_work(proc, &proc->todo); binder_release_work(proc, &proc->delivered_death); @@ -4817,8 +4843,10 @@ static void print_binder_node_nilocked(struct seq_file *m, } } -static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) +static void print_binder_ref_olocked(struct seq_file *m, + struct binder_ref *ref) { + WARN_ON(!spin_is_locked(&ref->proc->outer_lock)); binder_node_lock(ref->node); seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", ref->data.debug_id, ref->data.desc, @@ -4870,11 +4898,14 @@ static void print_binder_proc(struct seq_file *m, binder_put_node(last_node); if (print_all) { + binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) - print_binder_ref(m, rb_entry(n, struct binder_ref, - rb_node_desc)); + print_binder_ref_olocked(m, rb_entry(n, + struct binder_ref, + rb_node_desc)); + binder_proc_unlock(proc); } binder_alloc_print_allocated(m, &proc->alloc); binder_inner_proc_lock(proc); @@ -5014,6 +5045,7 @@ static void print_binder_proc_stats(struct seq_file *m, count = 0; strong = 0; weak = 0; + binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { struct binder_ref *ref = rb_entry(n, struct binder_ref, rb_node_desc); @@ -5021,6 +5053,7 @@ strong += ref->data.strong; weak += ref->data.weak; } + binder_proc_unlock(proc); seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); count = binder_alloc_get_allocated_count(&proc->alloc); From da957e45dd21fee345544dadf0fc426abe503d28 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Fri, 21 Apr 2017 14:32:11 -0700 Subject: [PATCH 0932/3220] FROMLIST: binder: protect against stale pointers in print_binder_transaction (from https://patchwork.kernel.org/patch/9817761/) When printing transactions, there were several race conditions that could cause a stale pointer to be dereferenced. Fixed by reading the pointer once and using it if valid (which is safe). The transaction buffer also needed protection via proc lock, so it is only printed if we are holding the correct lock. Bug: 36650912 Test: tested manually Change-Id: I78240f99cc1a070d70a841c0d84d4306e2fd528d Signed-off-by: Todd Kjos --- drivers/android/binder.c | 60 ++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2ef3022d969d..2e5016879654 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4703,35 +4703,52 @@ binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) mutex_unlock(&binder_deferred_lock); } -static void print_binder_transaction(struct seq_file *m, const char *prefix, - struct binder_transaction *t) +static void print_binder_transaction_ilocked(struct seq_file *m, + struct binder_proc *proc, + const char *prefix, + struct binder_transaction *t) { + struct binder_proc *to_proc; + struct binder_buffer *buffer = t->buffer; + + WARN_ON(!spin_is_locked(&proc->inner_lock)); spin_lock(&t->lock); + to_proc = t->to_proc; seq_printf(m, "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d", prefix, t->debug_id, t, t->from ? t->from->proc->pid : 0, t->from ? t->from->pid : 0, - t->to_proc ? t->to_proc->pid : 0, + to_proc ? to_proc->pid : 0, t->to_thread ?
t->to_thread->pid : 0, t->code, t->flags, t->priority, t->need_reply); spin_unlock(&t->lock); - if (t->buffer == NULL) { + if (proc != to_proc) { + /* + * Can only safely deref buffer if we are holding the + * correct proc inner lock for this node + */ + seq_puts(m, "\n"); + return; + } + + if (buffer == NULL) { seq_puts(m, " buffer free\n"); return; } - if (t->buffer->target_node) - seq_printf(m, " node %d", - t->buffer->target_node->debug_id); + if (buffer->target_node) + seq_printf(m, " node %d", buffer->target_node->debug_id); seq_printf(m, " size %zd:%zd data %p\n", - t->buffer->data_size, t->buffer->offsets_size, - t->buffer->data); + buffer->data_size, buffer->offsets_size, + buffer->data); } -static void print_binder_work_ilocked(struct seq_file *m, const char *prefix, - const char *transaction_prefix, - struct binder_work *w) +static void print_binder_work_ilocked(struct seq_file *m, + struct binder_proc *proc, + const char *prefix, + const char *transaction_prefix, + struct binder_work *w) { struct binder_node *node; struct binder_transaction *t; @@ -4739,7 +4756,8 @@ static void print_binder_work_ilocked(struct seq_file *m, const char *prefix, switch (w->type) { case BINDER_WORK_TRANSACTION: t = container_of(w, struct binder_transaction, work); - print_binder_transaction(m, transaction_prefix, t); + print_binder_transaction_ilocked( + m, proc, transaction_prefix, t); break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( @@ -4790,20 +4808,21 @@ static void print_binder_thread_ilocked(struct seq_file *m, t = thread->transaction_stack; while (t) { if (t->from == thread) { - print_binder_transaction(m, - " outgoing transaction", t); + print_binder_transaction_ilocked(m, thread->proc, + " outgoing transaction", t); t = t->from_parent; } else if (t->to_thread == thread) { - print_binder_transaction(m, + print_binder_transaction_ilocked(m, thread->proc, " incoming transaction", t); t = t->to_parent; } else { - print_binder_transaction(m, " bad transaction", t); + print_binder_transaction_ilocked(m, thread->proc, + " bad transaction", t); t = NULL; } } list_for_each_entry(w, &thread->todo, entry) { - print_binder_work_ilocked(m, " ", + print_binder_work_ilocked(m, thread->proc, " ", " pending transaction", w); } if (!print_always && m->count == header_pos) @@ -4838,7 +4857,7 @@ static void print_binder_node_nilocked(struct seq_file *m, seq_puts(m, "\n"); if (node->proc) { list_for_each_entry(w, &node->async_todo, entry) - print_binder_work_ilocked(m, " ", + print_binder_work_ilocked(m, node->proc, " ", " pending async transaction", w); } } @@ -4910,7 +4929,8 @@ static void print_binder_proc(struct seq_file *m, binder_alloc_print_allocated(m, &proc->alloc); binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) - print_binder_work_ilocked(m, " ", " pending transaction", w); + print_binder_work_ilocked(m, proc, " ", + " pending transaction", w); list_for_each_entry(w, &proc->delivered_death, entry) { seq_puts(m, " has delivered dead binder\n"); break; From 6c8ad5b3f0366b8b175d8ee4223fc8dbafc39991 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 22 May 2017 11:26:23 -0700 Subject: [PATCH 0933/3220] FROMLIST: binder: fix death race conditions (from https://patchwork.kernel.org/patch/9817765/) A race existed where one thread could register a death notification for a node, while another thread was cleaning up that node and sending out death notifications for its references, causing simultaneous access to ref->death because different locks 
were held. Test: boots, manual testing Change-Id: Iff73312f34f70374f417beba4c4c82dd33cac119 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 64 ++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 2e5016879654..a161e55d1373 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -442,6 +442,7 @@ struct binder_ref_data { * ref for deletion in binder_cleanup_ref, a non-NULL * @node indicates the node must be freed * @death: pointer to death notification (ref_death) if requested + * (protected by @node->lock) * * Structure to track references from procA to target node (on procB). This * structure is unsafe to access without holding @proc->outer_lock. @@ -3338,10 +3339,12 @@ static int binder_thread_write(struct binder_proc *proc, ref->data.desc, ref->data.strong, ref->data.weak, ref->node->debug_id); + binder_node_lock(ref->node); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { if (ref->death) { binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", proc->pid, thread->pid); + binder_node_unlock(ref->node); binder_proc_unlock(proc); kfree(death); break; @@ -3350,7 +3353,6 @@ static int binder_thread_write(struct binder_proc *proc, INIT_LIST_HEAD(&death->work.entry); death->cookie = cookie; ref->death = death; - binder_node_lock(ref->node); if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; if (thread->looper & @@ -3369,9 +3371,7 @@ static int binder_thread_write(struct binder_proc *proc, &proc->wait); } } - binder_node_unlock(ref->node); } else { - binder_node_lock(ref->node); if (ref->death == NULL) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); @@ -3411,8 +3411,8 @@ static int binder_thread_write(struct binder_proc *proc, death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } binder_inner_proc_unlock(proc); - binder_node_unlock(ref->node); } + binder_node_unlock(ref->node); binder_proc_unlock(proc); } break; case BC_DEAD_BINDER_DONE: { @@ -3749,44 +3749,39 @@ retry: case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { struct binder_ref_death *death; uint32_t cmd; + binder_uintptr_t cookie; death = container_of(w, struct binder_ref_death, work); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; else cmd = BR_DEAD_BINDER; - /* - * TODO: there is a race condition between - * death notification requests and delivery - * of the notifications. This will be handled - * in a later patch. - */ - binder_inner_proc_unlock(proc); - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (put_user(death->cookie, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - binder_stat_br(proc, thread, cmd); + cookie = death->cookie; + binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, "%d:%d %s %016llx\n", proc->pid, thread->pid, cmd == BR_DEAD_BINDER ? 
"BR_DEAD_BINDER" : "BR_CLEAR_DEATH_NOTIFICATION_DONE", - (u64)death->cookie); - + (u64)cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { + binder_inner_proc_unlock(proc); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } else { - binder_inner_proc_lock(proc); binder_enqueue_work_ilocked( w, &proc->delivered_death); binder_inner_proc_unlock(proc); } + if (put_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + if (put_user(cookie, + (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + binder_stat_br(proc, thread, cmd); if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; @@ -4536,20 +4531,25 @@ static int binder_node_release(struct binder_node *node, int refs) hlist_for_each_entry(ref, &node->refs, node_entry) { refs++; - - if (!ref->death) + /* + * Need the node lock to synchronize + * with new notification requests and the + * inner lock to synchronize with queued + * death notifications. + */ + binder_inner_proc_lock(ref->proc); + if (!ref->death) { + binder_inner_proc_unlock(ref->proc); continue; + } death++; - binder_inner_proc_lock(ref->proc); - if (list_empty(&ref->death->work.entry)) { - ref->death->work.type = BINDER_WORK_DEAD_BINDER; - binder_enqueue_work_ilocked(&ref->death->work, - &ref->proc->todo); - wake_up_interruptible(&ref->proc->wait); - } else - BUG(); + BUG_ON(!list_empty(&ref->death->work.entry)); + ref->death->work.type = BINDER_WORK_DEAD_BINDER; + binder_enqueue_work_ilocked(&ref->death->work, + &ref->proc->todo); + wake_up_interruptible(&ref->proc->wait); binder_inner_proc_unlock(ref->proc); } From 8881f118f526914c4705db04e6ea2e8e5f7023d4 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 14 Nov 2016 11:37:41 -0800 Subject: [PATCH 0934/3220] FROMLIST: binder: remove global binder lock (from https://patchwork.kernel.org/patch/9817773/) Remove global mutex and rely on fine-grained locking Change-Id: Idd1ae2e52d654e5dd76d443a1ff97522e687fd4c Signed-off-by: Todd Kjos --- drivers/android/binder.c | 46 +++------------------------------------- 1 file changed, 3 insertions(+), 43 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index a161e55d1373..5753665f76ee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -79,8 +79,6 @@ #include "binder_alloc.h" #include "binder_trace.h" -static DEFINE_MUTEX(binder_main_lock); - static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); @@ -924,19 +922,6 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) return retval; } -static inline void binder_lock(const char *tag) -{ - trace_binder_lock(tag); - mutex_lock(&binder_main_lock); - trace_binder_locked(tag); -} - -static inline void binder_unlock(const char *tag) -{ - trace_binder_unlock(tag); - mutex_unlock(&binder_main_lock); -} - static void binder_set_nice(long nice) { long min_nice; @@ -3558,8 +3543,6 @@ retry: thread->looper |= BINDER_LOOPER_STATE_WAITING; - binder_unlock(__func__); - trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, !binder_worklist_empty(proc, &thread->todo)); @@ -3585,8 +3568,6 @@ retry: ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); } - binder_lock(__func__); - binder_inner_proc_lock(proc); if (wait_for_proc_work) proc->ready_threads--; @@ -4102,8 +4083,6 @@ static unsigned int binder_poll(struct file *filp, struct binder_thread *thread = NULL; int wait_for_proc_work; - 
binder_lock(__func__); - thread = binder_get_thread(proc); binder_inner_proc_lock(thread->proc); @@ -4111,8 +4090,6 @@ static unsigned int binder_poll(struct file *filp, binder_worklist_empty_ilocked(&thread->todo); binder_inner_proc_unlock(thread->proc); - binder_unlock(__func__); - if (wait_for_proc_work) { if (binder_has_proc_work(proc, thread)) return POLLIN; @@ -4257,7 +4234,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) goto err_unlocked; - binder_lock(__func__); thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; @@ -4316,7 +4292,6 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err: if (thread) thread->looper_need_return = false; - binder_unlock(__func__); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -ERESTARTSYS) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); @@ -4422,15 +4397,11 @@ static int binder_open(struct inode *nodp, struct file *filp) proc->context = &binder_dev->context; binder_alloc_init(&proc->alloc); - binder_lock(__func__); - binder_stats_created(BINDER_STAT_PROC); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); filp->private_data = proc; - binder_unlock(__func__); - mutex_lock(&binder_procs_lock); hlist_add_head(&proc->proc_node, &binder_procs); mutex_unlock(&binder_procs_lock); @@ -4656,7 +4627,6 @@ static void binder_deferred_func(struct work_struct *work) int defer; do { - binder_lock(__func__); mutex_lock(&binder_deferred_lock); if (!hlist_empty(&binder_deferred_list)) { proc = hlist_entry(binder_deferred_list.first, @@ -4683,7 +4653,6 @@ static void binder_deferred_func(struct work_struct *work) if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ - binder_unlock(__func__); if (files) put_files_struct(files); } while (proc); @@ -5098,8 +5067,6 @@ static int binder_state_show(struct seq_file *m, void *unused) struct binder_node *node; struct binder_node *last_node = NULL; - binder_lock(__func__); - seq_puts(m, "binder state:\n"); spin_lock(&binder_dead_nodes_lock); @@ -5129,7 +5096,7 @@ static int binder_state_show(struct seq_file *m, void *unused) hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 1); mutex_unlock(&binder_procs_lock); - binder_unlock(__func__); + return 0; } @@ -5137,8 +5104,6 @@ static int binder_stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; - binder_lock(__func__); - seq_puts(m, "binder stats:\n"); print_binder_stats(m, "", &binder_stats); @@ -5147,7 +5112,7 @@ static int binder_stats_show(struct seq_file *m, void *unused) hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); mutex_unlock(&binder_procs_lock); - binder_unlock(__func__); + return 0; } @@ -5155,14 +5120,12 @@ static int binder_transactions_show(struct seq_file *m, void *unused) { struct binder_proc *proc; - binder_lock(__func__); - seq_puts(m, "binder transactions:\n"); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 0); mutex_unlock(&binder_procs_lock); - binder_unlock(__func__); + return 0; } @@ -5171,8 +5134,6 @@ static int binder_proc_show(struct seq_file *m, void *unused) struct binder_proc *itr; int pid = (unsigned long)m->private; - binder_lock(__func__); - mutex_lock(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { @@ 
-5182,7 +5143,6 @@ static int binder_proc_show(struct seq_file *m, void *unused) } mutex_unlock(&binder_procs_lock); - binder_unlock(__func__); return 0; } From c9cd6356f9631c9489b850c3e04d15071899fa6b Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 2 Jun 2017 11:15:44 -0700 Subject: [PATCH 0935/3220] ANDROID: binder: remove proc waitqueue Removes the process waitqueue, so that threads can only wait on the thread waitqueue. Whenever there is process work to do, pick a thread and wake it up. This also fixes an issue with using epoll(), since we no longer have to block on different waitqueues. Bug: 34461621 Change-Id: I2950b9de6fa078ee72d53c667a03cbaf587f0849 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 257 +++++++++++++++++++++++++++------------ 1 file changed, 182 insertions(+), 75 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 5753665f76ee..05185b7b1343 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -28,10 +28,10 @@ * binder_node_lock() and binder_node_unlock() are * used to acq/rel * 3) proc->inner_lock : protects the thread and node lists - * (proc->threads, proc->nodes) and all todo lists associated - * with the binder_proc (proc->todo, thread->todo, - * proc->delivered_death and node->async_todo), as well as - * thread->transaction_stack + * (proc->threads, proc->waiting_threads, proc->nodes) + * and all todo lists associated with the binder_proc + * (proc->todo, thread->todo, proc->delivered_death and + * node->async_todo), as well as thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * @@ -477,6 +477,8 @@ enum binder_deferred_state { * (protected by @outer_lock) * @refs_by_node: rbtree of refs ordered by ref->node * (protected by @outer_lock) + * @waiting_threads: threads currently waiting for proc work + * (protected by @inner_lock) * @pid PID of group_leader of process * (invariant after initialized) * @tsk task_struct for group_leader of process @@ -506,8 +508,6 @@ enum binder_deferred_state { * (protected by @inner_lock) * @requested_threads_started: number binder threads started * (protected by @inner_lock) - * @ready_threads: number of threads waiting for proc work - * (protected by @inner_lock) * @tmp_ref: temporary reference to indicate proc is in use * (protected by @inner_lock) * @default_priority: default scheduler priority @@ -528,6 +528,7 @@ struct binder_proc { struct rb_root nodes; struct rb_root refs_by_desc; struct rb_root refs_by_node; + struct list_head waiting_threads; int pid; struct task_struct *tsk; struct files_struct *files; @@ -542,7 +543,6 @@ struct binder_proc { int max_threads; int requested_threads; int requested_threads_started; - int ready_threads; int tmp_ref; long default_priority; struct dentry *debugfs_entry; @@ -558,6 +558,7 @@ enum { BINDER_LOOPER_STATE_EXITED = 0x04, BINDER_LOOPER_STATE_INVALID = 0x08, BINDER_LOOPER_STATE_WAITING = 0x10, + BINDER_LOOPER_STATE_POLL = 0x20, }; /** @@ -566,6 +567,8 @@ enum { * (invariant after initialization) * @rb_node: element for proc->threads rbtree * (protected by @proc->inner_lock) + * @waiting_thread_node: element for @proc->waiting_threads list + * (protected by @proc->inner_lock) * @pid: PID for this thread * (invariant after initialization) * @looper: bitmap of looping state @@ -595,6 +598,7 @@ enum { struct binder_thread { struct binder_proc *proc; struct rb_node rb_node; + struct list_head waiting_thread_node; int pid; int looper; /* only modified by this thread 
*/ bool looper_need_return; /* can be written by other thread */ @@ -922,6 +926,86 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) return retval; } +static bool binder_has_work_ilocked(struct binder_thread *thread, + bool do_proc_work) +{ + return !binder_worklist_empty_ilocked(&thread->todo) || + thread->looper_need_return || + (do_proc_work && + !binder_worklist_empty_ilocked(&thread->proc->todo)); +} + +static bool binder_has_work(struct binder_thread *thread, bool do_proc_work) +{ + bool has_work; + + binder_inner_proc_lock(thread->proc); + has_work = binder_has_work_ilocked(thread, do_proc_work); + binder_inner_proc_unlock(thread->proc); + + return has_work; +} + +static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread) +{ + return !thread->transaction_stack && + binder_worklist_empty_ilocked(&thread->todo) && + (thread->looper & (BINDER_LOOPER_STATE_ENTERED | + BINDER_LOOPER_STATE_REGISTERED)); +} + +static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, + bool sync) +{ + struct rb_node *n; + struct binder_thread *thread; + + for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { + thread = rb_entry(n, struct binder_thread, rb_node); + if (thread->looper & BINDER_LOOPER_STATE_POLL && + binder_available_for_proc_work_ilocked(thread)) { + if (sync) + wake_up_interruptible_sync(&thread->wait); + else + wake_up_interruptible(&thread->wait); + } + } +} + +static void binder_wakeup_proc_ilocked(struct binder_proc *proc, bool sync) +{ + struct binder_thread *thread; + + BUG_ON(!spin_is_locked(&proc->inner_lock)); + thread = list_first_entry_or_null(&proc->waiting_threads, + struct binder_thread, + waiting_thread_node); + + if (thread) { + list_del_init(&thread->waiting_thread_node); + if (sync) + wake_up_interruptible_sync(&thread->wait); + else + wake_up_interruptible(&thread->wait); + return; + } + + /* Didn't find a thread waiting for proc work; this can happen + * in two scenarios: + * 1. All threads are busy handling transactions + * In that case, one of those threads should call back into + * the kernel driver soon and pick up this work. + * 2. Threads are using the (e)poll interface, in which case + * they may be blocked on the waitqueue without having been + * added to waiting_threads. For this case, we just iterate + * over all threads not handling transaction work, and + * wake them all up. We wake all because we don't know whether + * a thread that called into (e)poll is handling non-binder + * work currently. 
+ */ + binder_wakeup_poll_threads_ilocked(proc, sync); +} + static void binder_set_nice(long nice) { long min_nice; @@ -1141,7 +1225,7 @@ static bool binder_dec_node_nilocked(struct binder_node *node, if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { binder_enqueue_work_ilocked(&node->work, &proc->todo); - wake_up_interruptible(&node->proc->wait); + binder_wakeup_proc_ilocked(proc, false); } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && @@ -2402,7 +2486,6 @@ static void binder_transaction(struct binder_proc *proc, struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; struct list_head *target_list; - wait_queue_head_t *target_wait; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error = 0; @@ -2412,6 +2495,7 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); + bool wakeup_for_proc_work = false; e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -2575,10 +2659,9 @@ static void binder_transaction(struct binder_proc *proc, if (target_thread) { e->to_thread = target_thread->pid; target_list = &target_thread->todo; - target_wait = &target_thread->wait; } else { target_list = &target_proc->todo; - target_wait = &target_proc->wait; + wakeup_for_proc_work = true; } e->to_proc = target_proc->pid; @@ -2885,7 +2968,7 @@ static void binder_transaction(struct binder_proc *proc, binder_node_lock(target_node); if (target_node->has_async_transaction) { target_list = &target_node->async_todo; - target_wait = NULL; + wakeup_for_proc_work = false; } else target_node->has_async_transaction = 1; /* @@ -2904,11 +2987,13 @@ static void binder_transaction(struct binder_proc *proc, binder_inner_proc_unlock(target_proc); binder_node_unlock(target_node); } - if (target_wait) { - if (reply || !(tr->flags & TF_ONE_WAY)) - wake_up_interruptible_sync(target_wait); - else - wake_up_interruptible(target_wait); + if (target_thread) { + wake_up_interruptible_sync(&target_thread->wait); + } else if (wakeup_for_proc_work) { + binder_inner_proc_lock(target_proc); + binder_wakeup_proc_ilocked(target_proc, + !(tr->flags & TF_ONE_WAY)); + binder_inner_proc_unlock(target_proc); } if (target_thread) binder_thread_dec_tmpref(target_thread); @@ -3348,12 +3433,14 @@ static int binder_thread_write(struct binder_proc *proc, &ref->death->work, &thread->todo); else { - binder_enqueue_work( - proc, + binder_inner_proc_lock(proc); + binder_enqueue_work_ilocked( &ref->death->work, &proc->todo); - wake_up_interruptible( - &proc->wait); + binder_wakeup_proc_ilocked( + proc, + false); + binder_inner_proc_unlock(proc); } } } else { @@ -3388,8 +3475,9 @@ static int binder_thread_write(struct binder_proc *proc, binder_enqueue_work_ilocked( &death->work, &proc->todo); - wake_up_interruptible( - &proc->wait); + binder_wakeup_proc_ilocked( + proc, + false); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); @@ -3444,7 +3532,8 @@ static int binder_thread_write(struct binder_proc *proc, binder_enqueue_work_ilocked( &death->work, &proc->todo); - wake_up_interruptible(&proc->wait); + binder_wakeup_proc_ilocked( + proc, false); } } binder_inner_proc_unlock(proc); @@ -3471,13 +3560,6 @@ static void binder_stat_br(struct binder_proc *proc, } } -static int binder_has_proc_work(struct binder_proc *proc, - struct 
binder_thread *thread) -{ - return !binder_worklist_empty(proc, &proc->todo) || - thread->looper_need_return; -} - static int binder_has_thread_work(struct binder_thread *thread) { return !binder_worklist_empty(thread->proc, &thread->todo) || @@ -3515,6 +3597,38 @@ static int binder_put_node_cmd(struct binder_proc *proc, return 0; } +static int binder_wait_for_work(struct binder_thread *thread, + bool do_proc_work) +{ + DEFINE_WAIT(wait); + struct binder_proc *proc = thread->proc; + int ret = 0; + + freezer_do_not_count(); + binder_inner_proc_lock(proc); + for (;;) { + prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE); + if (binder_has_work_ilocked(thread, do_proc_work)) + break; + if (do_proc_work) + list_add(&thread->waiting_thread_node, + &proc->waiting_threads); + binder_inner_proc_unlock(proc); + schedule(); + binder_inner_proc_lock(proc); + list_del_init(&thread->waiting_thread_node); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + } + finish_wait(&thread->wait, &wait); + binder_inner_proc_unlock(proc); + freezer_count(); + + return ret; +} + static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, @@ -3535,10 +3649,7 @@ static int binder_thread_read(struct binder_proc *proc, retry: binder_inner_proc_lock(proc); - wait_for_proc_work = thread->transaction_stack == NULL && - binder_worklist_empty_ilocked(&thread->todo); - if (wait_for_proc_work) - proc->ready_threads++; + wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); binder_inner_proc_unlock(proc); thread->looper |= BINDER_LOOPER_STATE_WAITING; @@ -3555,23 +3666,15 @@ retry: binder_stop_on_user_error < 2); } binder_set_nice(proc->default_priority); - if (non_block) { - if (!binder_has_proc_work(proc, thread)) - ret = -EAGAIN; - } else - ret = wait_event_freezable_exclusive(proc->wait, binder_has_proc_work(proc, thread)); - } else { - if (non_block) { - if (!binder_has_thread_work(thread)) - ret = -EAGAIN; - } else - ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); } - binder_inner_proc_lock(proc); - if (wait_for_proc_work) - proc->ready_threads--; - binder_inner_proc_unlock(proc); + if (non_block) { + if (!binder_has_work(thread, wait_for_proc_work)) + ret = -EAGAIN; + } else { + ret = binder_wait_for_work(thread, wait_for_proc_work); + } + thread->looper &= ~BINDER_LOOPER_STATE_WAITING; if (ret) @@ -3857,7 +3960,8 @@ done: *consumed = ptr - buffer; binder_inner_proc_lock(proc); - if (proc->requested_threads + proc->ready_threads == 0 && + if (proc->requested_threads == 0 && + list_empty(&thread->proc->waiting_threads) && proc->requested_threads_started < proc->max_threads && (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ @@ -3968,7 +4072,7 @@ static struct binder_thread *binder_get_thread_ilocked( thread->return_error.cmd = BR_OK; thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR; thread->reply_error.cmd = BR_OK; - + INIT_LIST_HEAD(&new_thread->waiting_thread_node); return thread; } @@ -4081,28 +4185,24 @@ static unsigned int binder_poll(struct file *filp, { struct binder_proc *proc = filp->private_data; struct binder_thread *thread = NULL; - int wait_for_proc_work; + bool wait_for_proc_work; thread = binder_get_thread(proc); binder_inner_proc_lock(thread->proc); - wait_for_proc_work = thread->transaction_stack == NULL && - binder_worklist_empty_ilocked(&thread->todo); + thread->looper |= 
BINDER_LOOPER_STATE_POLL; + wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); + binder_inner_proc_unlock(thread->proc); - if (wait_for_proc_work) { - if (binder_has_proc_work(proc, thread)) - return POLLIN; - poll_wait(filp, &proc->wait, wait); - if (binder_has_proc_work(proc, thread)) - return POLLIN; - } else { - if (binder_has_thread_work(thread)) - return POLLIN; - poll_wait(filp, &thread->wait, wait); - if (binder_has_thread_work(thread)) - return POLLIN; - } + if (binder_has_work(thread, wait_for_proc_work)) + return POLLIN; + + poll_wait(filp, &thread->wait, wait); + + if (binder_has_thread_work(thread)) + return POLLIN; + return 0; } @@ -4149,8 +4249,10 @@ static int binder_ioctl_write_read(struct file *filp, &bwr.read_consumed, filp->f_flags & O_NONBLOCK); trace_binder_read_done(ret); - if (!binder_worklist_empty(proc, &proc->todo)) - wake_up_interruptible(&proc->wait); + binder_inner_proc_lock(proc); + if (!binder_worklist_empty_ilocked(&proc->todo)) + binder_wakeup_proc_ilocked(proc, false); + binder_inner_proc_unlock(proc); if (ret < 0) { if (copy_to_user(ubuf, &bwr, sizeof(bwr))) ret = -EFAULT; @@ -4390,7 +4492,6 @@ static int binder_open(struct inode *nodp, struct file *filp) get_task_struct(current->group_leader); proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); - init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); binder_dev = container_of(filp->private_data, struct binder_device, miscdev); @@ -4400,6 +4501,7 @@ static int binder_open(struct inode *nodp, struct file *filp) binder_stats_created(BINDER_STAT_PROC); proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); + INIT_LIST_HEAD(&proc->waiting_threads); filp->private_data = proc; mutex_lock(&binder_procs_lock); @@ -4451,7 +4553,6 @@ static void binder_deferred_flush(struct binder_proc *proc) } } binder_inner_proc_unlock(proc); - wake_up_interruptible_all(&proc->wait); binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_flush: %d woke %d threads\n", proc->pid, @@ -4520,7 +4621,7 @@ static int binder_node_release(struct binder_node *node, int refs) ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_enqueue_work_ilocked(&ref->death->work, &ref->proc->todo); - wake_up_interruptible(&ref->proc->wait); + binder_wakeup_proc_ilocked(ref->proc, false); binder_inner_proc_unlock(ref->proc); } @@ -5008,23 +5109,29 @@ static void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { struct binder_work *w; + struct binder_thread *thread; struct rb_node *n; - int count, strong, weak; + int count, strong, weak, ready_threads; size_t free_async_space = binder_alloc_get_free_async_space(&proc->alloc); seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); count = 0; + ready_threads = 0; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; + + list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node) + ready_threads++; + seq_printf(m, " threads: %d\n", count); seq_printf(m, " requested threads: %d+%d/%d\n" " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, - proc->ready_threads, + ready_threads, free_async_space); count = 0; for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) From 5347bf52735ec797553aa11189f03742f4015606 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 6 Jun 2017 15:17:46 -0700 Subject: [PATCH 0936/3220] ANDROID: binder: push new 
transactions to waiting threads. Instead of pushing new transactions to the process waitqueue, select a thread that is waiting on proc work to handle the transaction. This will make it easier to improve priority inheritance in future patches, by setting the priority before we wake up a thread. If we can't find a waiting thread, submit the work to the proc waitqueue instead as we did previously. Change-Id: I23cbfcca867bed7b86007e22137d0a8fad4b4001 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 181 +++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 54 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 05185b7b1343..e2daf6b7cbd0 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -972,7 +972,20 @@ static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, } } -static void binder_wakeup_proc_ilocked(struct binder_proc *proc, bool sync) +/** + * binder_select_thread_ilocked() - selects a thread for doing proc work. + * @proc: process to select a thread from + * + * Note that calling this function moves the thread off the waiting_threads + * list, so it can only be woken up by the caller of this function, or a + * signal. Therefore, callers *should* always wake up the thread this function + * returns. + * + * Return: If there's a thread currently waiting for process work, + * returns that thread. Otherwise returns NULL. + */ +static struct binder_thread * +binder_select_thread_ilocked(struct binder_proc *proc) { struct binder_thread *thread; @@ -981,8 +994,35 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc, bool sync) struct binder_thread, waiting_thread_node); - if (thread) { + if (thread) list_del_init(&thread->waiting_thread_node); + + return thread; +} + +/** + * binder_wakeup_thread_ilocked() - wakes up a thread for doing proc work. + * @proc: process to wake up a thread in + * @thread: specific thread to wake-up (may be NULL) + * @sync: whether to do a synchronous wake-up + * + * This function wakes up a thread in the @proc process. + * The caller may provide a specific thread to wake-up in + * the @thread parameter. If @thread is NULL, this function + * will wake up threads that have called poll(). + * + * Note that for this function to work as expected, callers + * should first call binder_select_thread() to find a thread + * to handle the work (if they don't have a thread already), + * and pass the result into the @thread parameter. 
+ */ +static void binder_wakeup_thread_ilocked(struct binder_proc *proc, + struct binder_thread *thread, + bool sync) +{ + BUG_ON(!spin_is_locked(&proc->inner_lock)); + + if (thread) { if (sync) wake_up_interruptible_sync(&thread->wait); else @@ -1006,6 +1046,13 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc, bool sync) binder_wakeup_poll_threads_ilocked(proc, sync); } +static void binder_wakeup_proc_ilocked(struct binder_proc *proc) +{ + struct binder_thread *thread = binder_select_thread_ilocked(proc); + + binder_wakeup_thread_ilocked(proc, thread, /* sync = */false); +} + static void binder_set_nice(long nice) { long min_nice; @@ -1225,7 +1272,7 @@ static bool binder_dec_node_nilocked(struct binder_node *node, if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { binder_enqueue_work_ilocked(&node->work, &proc->todo); - binder_wakeup_proc_ilocked(proc, false); + binder_wakeup_proc_ilocked(proc); } } else { if (hlist_empty(&node->refs) && !node->local_strong_refs && @@ -2471,6 +2518,73 @@ static int binder_fixup_parent(struct binder_transaction *t, return 0; } +/** + * binder_proc_transaction() - sends a transaction to a process and wakes it up + * @t: transaction to send + * @proc: process to send the transaction to + * @thread: thread in @proc to send the transaction to (may be NULL) + * + * This function queues a transaction to the specified process. It will try + * to find a thread in the target process to handle the transaction and + * wake it up. If no thread is found, the work is queued to the proc + * waitqueue. + * + * If the @thread parameter is not NULL, the transaction is always queued + * to the waitlist of that specific thread. + * + * Return: true if the transactions was successfully queued + * false if the target process or thread is dead + */ +static bool binder_proc_transaction(struct binder_transaction *t, + struct binder_proc *proc, + struct binder_thread *thread) +{ + struct list_head *target_list = NULL; + struct binder_node *node = t->buffer->target_node; + bool oneway = !!(t->flags & TF_ONE_WAY); + bool wakeup = true; + + BUG_ON(!node); + binder_node_lock(node); + if (oneway) { + BUG_ON(thread); + if (node->has_async_transaction) { + target_list = &node->async_todo; + wakeup = false; + } else { + node->has_async_transaction = 1; + } + } + + binder_inner_proc_lock(proc); + + if (proc->is_dead || (thread && thread->is_dead)) { + binder_inner_proc_unlock(proc); + binder_node_unlock(node); + return false; + } + + if (!thread && !target_list) + thread = binder_select_thread_ilocked(proc); + + if (thread) + target_list = &thread->todo; + else if (!target_list) + target_list = &proc->todo; + else + BUG_ON(target_list != &node->async_todo); + + binder_enqueue_work_ilocked(&t->work, target_list); + + if (wakeup) + binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); + + binder_inner_proc_unlock(proc); + binder_node_unlock(node); + + return true; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -2485,7 +2599,6 @@ static void binder_transaction(struct binder_proc *proc, struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; - struct list_head *target_list; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error = 0; @@ -2495,7 +2608,6 @@ static void binder_transaction(struct binder_proc 
*proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); - bool wakeup_for_proc_work = false; e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -2656,13 +2768,8 @@ static void binder_transaction(struct binder_proc *proc, } binder_inner_proc_unlock(proc); } - if (target_thread) { + if (target_thread) e->to_thread = target_thread->pid; - target_list = &target_thread->todo; - } else { - target_list = &target_proc->todo; - wakeup_for_proc_work = true; - } e->to_proc = target_proc->pid; /* TODO: reuse incoming transaction for reply */ @@ -2941,8 +3048,9 @@ static void binder_transaction(struct binder_proc *proc, } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); - binder_enqueue_work_ilocked(&t->work, target_list); + binder_enqueue_work_ilocked(&t->work, &target_thread->todo); binder_inner_proc_unlock(target_proc); + wake_up_interruptible_sync(&target_thread->wait); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -2951,49 +3059,17 @@ static void binder_transaction(struct binder_proc *proc, t->from_parent = thread->transaction_stack; thread->transaction_stack = t; binder_inner_proc_unlock(proc); - binder_inner_proc_lock(target_proc); - if (target_proc->is_dead || - (target_thread && target_thread->is_dead)) { - binder_inner_proc_unlock(target_proc); + if (!binder_proc_transaction(t, target_proc, target_thread)) { binder_inner_proc_lock(proc); binder_pop_transaction_ilocked(thread, t); binder_inner_proc_unlock(proc); goto err_dead_proc_or_thread; } - binder_enqueue_work_ilocked(&t->work, target_list); - binder_inner_proc_unlock(target_proc); } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); - binder_node_lock(target_node); - if (target_node->has_async_transaction) { - target_list = &target_node->async_todo; - wakeup_for_proc_work = false; - } else - target_node->has_async_transaction = 1; - /* - * Test/set of has_async_transaction - * must be atomic with enqueue on - * async_todo - */ - binder_inner_proc_lock(target_proc); - if (target_proc->is_dead || - (target_thread && target_thread->is_dead)) { - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); + if (!binder_proc_transaction(t, target_proc, NULL)) goto err_dead_proc_or_thread; - } - binder_enqueue_work_ilocked(&t->work, target_list); - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); - } - if (target_thread) { - wake_up_interruptible_sync(&target_thread->wait); - } else if (wakeup_for_proc_work) { - binder_inner_proc_lock(target_proc); - binder_wakeup_proc_ilocked(target_proc, - !(tr->flags & TF_ONE_WAY)); - binder_inner_proc_unlock(target_proc); } if (target_thread) binder_thread_dec_tmpref(target_thread); @@ -3438,8 +3514,7 @@ static int binder_thread_write(struct binder_proc *proc, &ref->death->work, &proc->todo); binder_wakeup_proc_ilocked( - proc, - false); + proc); binder_inner_proc_unlock(proc); } } @@ -3476,8 +3551,7 @@ static int binder_thread_write(struct binder_proc *proc, &death->work, &proc->todo); binder_wakeup_proc_ilocked( - proc, - false); + proc); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); @@ -3532,8 +3606,7 @@ static int binder_thread_write(struct binder_proc *proc, binder_enqueue_work_ilocked( &death->work, &proc->todo); - binder_wakeup_proc_ilocked( - proc, false); + 
binder_wakeup_proc_ilocked(proc); } } binder_inner_proc_unlock(proc); @@ -4251,7 +4324,7 @@ static int binder_ioctl_write_read(struct file *filp, trace_binder_read_done(ret); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&proc->todo)) - binder_wakeup_proc_ilocked(proc, false); + binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); if (ret < 0) { if (copy_to_user(ubuf, &bwr, sizeof(bwr))) @@ -4621,7 +4694,7 @@ static int binder_node_release(struct binder_node *node, int refs) ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_enqueue_work_ilocked(&ref->death->work, &ref->proc->todo); - binder_wakeup_proc_ilocked(ref->proc, false); + binder_wakeup_proc_ilocked(ref->proc); binder_inner_proc_unlock(ref->proc); } From d30e6a877a84916efb17836566aeecf2ce0d6e9f Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 6 Jun 2017 17:04:42 -0700 Subject: [PATCH 0937/3220] ANDROID: binder: add support for RT prio inheritance. Adds support for SCHED_BATCH/SCHED_FIFO/SCHED_RR priority inheritance. Change-Id: I71f356e476be2933713a0ecfa2cc31aa141e2dc6 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 163 ++++++++++++++++++++++++++++++++------- 1 file changed, 134 insertions(+), 29 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e2daf6b7cbd0..85a4651361b6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -465,6 +465,22 @@ enum binder_deferred_state { BINDER_DEFERRED_RELEASE = 0x04, }; +/** + * struct binder_priority - scheduler policy and priority + * @sched_policy scheduler policy + * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT + * + * The binder driver supports inheriting the following scheduler policies: + * SCHED_NORMAL + * SCHED_BATCH + * SCHED_FIFO + * SCHED_RR + */ +struct binder_priority { + unsigned int sched_policy; + int prio; +}; + /** * struct binder_proc - binder process bookkeeping * @proc_node: element for binder_procs list @@ -544,7 +560,7 @@ struct binder_proc { int requested_threads; int requested_threads_started; int tmp_ref; - long default_priority; + struct binder_priority default_priority; struct dentry *debugfs_entry; struct binder_alloc alloc; struct binder_context *context; @@ -626,8 +642,8 @@ struct binder_transaction { struct binder_buffer *buffer; unsigned int code; unsigned int flags; - long priority; - long saved_priority; + struct binder_priority priority; + struct binder_priority saved_priority; kuid_t sender_euid; /** * @lock: protects @from, @to_proc, and @to_thread @@ -1053,22 +1069,93 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc) binder_wakeup_thread_ilocked(proc, thread, /* sync = */false); } -static void binder_set_nice(long nice) +static bool is_rt_policy(int policy) { - long min_nice; + return policy == SCHED_FIFO || policy == SCHED_RR; +} - if (can_nice(current, nice)) { - set_user_nice(current, nice); +static bool is_fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + +static bool binder_supported_policy(int policy) +{ + return is_fair_policy(policy) || is_rt_policy(policy); +} + +static int to_userspace_prio(int policy, int kernel_priority) +{ + if (is_fair_policy(policy)) + return PRIO_TO_NICE(kernel_priority); + else + return MAX_USER_RT_PRIO - 1 - kernel_priority; +} + +static int to_kernel_prio(int policy, int user_priority) +{ + if (is_fair_policy(policy)) + return NICE_TO_PRIO(user_priority); + else + return MAX_USER_RT_PRIO - 1 - user_priority; +} + +static void 
binder_set_priority(struct task_struct *task, + struct binder_priority desired) +{ + int priority; /* user-space prio value */ + bool has_cap_nice; + unsigned int policy = desired.sched_policy; + + if (task->policy == policy && task->normal_prio == desired.prio) return; + + has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE); + + priority = to_userspace_prio(policy, desired.prio); + + if (is_rt_policy(policy) && !has_cap_nice) { + long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO); + + if (max_rtprio == 0) { + policy = SCHED_NORMAL; + priority = MIN_NICE; + } else if (priority > max_rtprio) { + priority = max_rtprio; + } } - min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); - binder_debug(BINDER_DEBUG_PRIORITY_CAP, - "%d: nice value %ld not allowed use %ld instead\n", - current->pid, nice, min_nice); - set_user_nice(current, min_nice); - if (min_nice <= MAX_NICE) - return; - binder_user_error("%d RLIMIT_NICE not set\n", current->pid); + + if (is_fair_policy(policy) && !has_cap_nice) { + long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE)); + + if (min_nice > MAX_NICE) { + binder_user_error("%d RLIMIT_NICE not set\n", + task->pid); + return; + } else if (priority < min_nice) { + priority = min_nice; + } + } + + if (policy != desired.sched_policy || + to_kernel_prio(policy, priority) != desired.prio) + binder_debug(BINDER_DEBUG_PRIORITY_CAP, + "%d: priority %d not allowed, using %d instead\n", + task->pid, desired.prio, + to_kernel_prio(policy, priority)); + + /* Set the actual priority */ + if (task->policy != policy || is_rt_policy(policy)) { + struct sched_param params; + + params.sched_priority = is_rt_policy(policy) ? priority : 0; + + sched_setscheduler_nocheck(task, + policy | SCHED_RESET_ON_FORK, + ¶ms); + } + if (is_fair_policy(policy)) + set_user_nice(task, priority); } static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, @@ -1152,7 +1239,8 @@ static struct binder_node *binder_init_node_ilocked( node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; - node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->min_priority = NICE_TO_PRIO( + flags & FLAT_BINDER_FLAG_PRIORITY_MASK); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); @@ -2649,7 +2737,7 @@ static void binder_transaction(struct binder_proc *proc, } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); - binder_set_nice(in_reply_to->saved_priority); + binder_set_priority(current, in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { return_error = BR_DEAD_REPLY; @@ -2822,7 +2910,15 @@ static void binder_transaction(struct binder_proc *proc, t->to_thread = target_thread; t->code = tr->code; t->flags = tr->flags; - t->priority = task_nice(current); + if (!(t->flags & TF_ONE_WAY) && + binder_supported_policy(current->policy)) { + /* Inherit supported policies for synchronous transactions */ + t->priority.sched_policy = current->policy; + t->priority.prio = current->normal_prio; + } else { + /* Otherwise, fall back to the default priority */ + t->priority = target_proc->default_priority; + } trace_binder_transaction(reply, t, target_node); @@ -3738,7 +3834,7 @@ retry: wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } - binder_set_nice(proc->default_priority); + binder_set_priority(current, proc->default_priority); } if (non_block) { 
@@ -3950,16 +4046,17 @@ retry: BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; + struct binder_priority prio = t->priority; tr.target.ptr = target_node->ptr; tr.cookie = target_node->cookie; - t->saved_priority = task_nice(current); - if (t->priority < target_node->min_priority && - !(t->flags & TF_ONE_WAY)) - binder_set_nice(t->priority); - else if (!(t->flags & TF_ONE_WAY) || - t->saved_priority > target_node->min_priority) - binder_set_nice(target_node->min_priority); + t->saved_priority.sched_policy = current->policy; + t->saved_priority.prio = current->normal_prio; + if (target_node->min_priority < t->priority.prio) { + prio.sched_policy = SCHED_NORMAL; + prio.prio = target_node->min_priority; + } + binder_set_priority(current, prio); cmd = BR_TRANSACTION; } else { tr.target.ptr = 0; @@ -4565,7 +4662,14 @@ static int binder_open(struct inode *nodp, struct file *filp) get_task_struct(current->group_leader); proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); - proc->default_priority = task_nice(current); + if (binder_supported_policy(current->policy)) { + proc->default_priority.sched_policy = current->policy; + proc->default_priority.prio = current->normal_prio; + } else { + proc->default_priority.sched_policy = SCHED_NORMAL; + proc->default_priority.prio = NICE_TO_PRIO(0); + } + binder_dev = container_of(filp->private_data, struct binder_device, miscdev); proc->context = &binder_dev->context; @@ -4858,13 +4962,14 @@ static void print_binder_transaction_ilocked(struct seq_file *m, spin_lock(&t->lock); to_proc = t->to_proc; seq_printf(m, - "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d", + "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %d:%d r%d", prefix, t->debug_id, t, t->from ? t->from->proc->pid : 0, t->from ? t->from->pid : 0, to_proc ? to_proc->pid : 0, t->to_thread ? t->to_thread->pid : 0, - t->code, t->flags, t->priority, t->need_reply); + t->code, t->flags, t->priority.sched_policy, + t->priority.prio, t->need_reply); spin_unlock(&t->lock); if (proc != to_proc) { From adb685439e768b120a5bdae15f333967d1e239ed Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 7 Jun 2017 09:29:14 -0700 Subject: [PATCH 0938/3220] ANDROID: binder: add min sched_policy to node. This change adds flags to flat_binder_object.flags to allow indicating a minimum scheduling policy for the node. It also clarifies the valid value range for the priority bits in the flags. Internally, we use the priority map that the kernel uses, e.g. [0..99] for real-time policies and [100..139] for the SCHED_NORMAL/SCHED_BATCH policies. 
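As a minimal userspace sketch of that priority map (mirroring the to_userspace_prio()/to_kernel_prio() helpers introduced earlier in this series; the MAX_USER_RT_PRIO and NICE_TO_PRIO definitions below are assumptions matching the kernel's scheduler headers, not part of this patch):

	#include <stdio.h>

	#define SCHED_NORMAL		0
	#define SCHED_FIFO		1
	#define MAX_USER_RT_PRIO	100
	#define MAX_RT_PRIO		MAX_USER_RT_PRIO
	/* nice [-20..19] maps to kernel prio [100..139] */
	#define NICE_TO_PRIO(nice)	((nice) + MAX_RT_PRIO + 20)

	static int to_kernel_prio(int policy, int user_priority)
	{
		if (policy == SCHED_NORMAL)
			return NICE_TO_PRIO(user_priority);
		/* RT: user rtprio [1..99] maps to kernel prio [98..0] */
		return MAX_USER_RT_PRIO - 1 - user_priority;
	}

	int main(void)
	{
		/* prints 130: nice 10 under SCHED_NORMAL */
		printf("%d\n", to_kernel_prio(SCHED_NORMAL, 10));
		/* prints 49: rtprio 50 under SCHED_FIFO */
		printf("%d\n", to_kernel_prio(SCHED_FIFO, 50));
		return 0;
	}

Note that for both policy classes a numerically lower kernel value means a higher priority, which is why the comparisons in binder_set_priority() use '<' on the kernel-side values.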
Bug: 34461621 Bug: 37293077 Change-Id: I12438deecb53df432da18c6fc77460768ae726d2 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 26 ++++++++++++++---- include/uapi/linux/android/binder.h | 41 ++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 85a4651361b6..838f4dbc9af5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -353,6 +353,8 @@ struct binder_error { * and by @lock) * @has_async_transaction: async transaction to node in progress * (protected by @lock) + * @sched_policy: minimum scheduling policy for node + * (invariant after initialized) * @accept_fds: file descriptor operations supported for node * (invariant after initialized) * @min_priority: minimum scheduling priority @@ -392,6 +394,7 @@ struct binder_node { /* * invariant after initialization */ + u8 sched_policy:2; u8 accept_fds:1; u8 min_priority; }; @@ -1208,6 +1211,7 @@ static struct binder_node *binder_init_node_ilocked( binder_uintptr_t ptr = fp ? fp->binder : 0; binder_uintptr_t cookie = fp ? fp->cookie : 0; __u32 flags = fp ? fp->flags : 0; + s8 priority; BUG_ON(!spin_is_locked(&proc->inner_lock)); while (*p) { @@ -1239,8 +1243,10 @@ static struct binder_node *binder_init_node_ilocked( node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; - node->min_priority = NICE_TO_PRIO( - flags & FLAT_BINDER_FLAG_PRIORITY_MASK); + priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->sched_policy = (flags & FLAT_BINDER_FLAG_PRIORITY_MASK) >> + FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT; + node->min_priority = to_kernel_prio(node->sched_policy, priority); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); @@ -4052,8 +4058,17 @@ retry: tr.cookie = target_node->cookie; t->saved_priority.sched_policy = current->policy; t->saved_priority.prio = current->normal_prio; - if (target_node->min_priority < t->priority.prio) { - prio.sched_policy = SCHED_NORMAL; + if (target_node->min_priority < t->priority.prio || + (target_node->min_priority == t->priority.prio && + target_node->sched_policy == SCHED_FIFO)) { + /* + * In case the minimum priority on the node is + * higher (lower value), use that priority. If + * the priority is the same, but the node uses + * SCHED_FIFO, prefer SCHED_FIFO, since it can + * run unbounded, unlike SCHED_RR. 
+ */ + prio.sched_policy = target_node->sched_policy; prio.prio = target_node->min_priority; } binder_set_priority(current, prio); @@ -5092,8 +5107,9 @@ static void print_binder_node_nilocked(struct seq_file *m, hlist_for_each_entry(ref, &node->refs, node_entry) count++; - seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d", + seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->debug_id, (u64)node->ptr, (u64)node->cookie, + node->sched_policy, node->min_priority, node->has_strong_ref, node->has_weak_ref, node->local_strong_refs, node->local_weak_refs, node->internal_strong_refs, count, node->tmp_refs); diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 7668b5791c91..026558ac254d 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -37,9 +37,48 @@ enum { BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; -enum { +/** + * enum flat_binder_object_shifts: shift values for flat_binder_object_flags + * @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy. + * + */ +enum flat_binder_object_shifts { + FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9, +}; + +/** + * enum flat_binder_object_flags - flags for use in flat_binder_object.flags + */ +enum flat_binder_object_flags { + /** + * @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority + * + * These bits can be used to set the minimum scheduler priority + * at which transactions into this node should run. Valid values + * in these bits depend on the scheduler policy encoded in + * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK. + * + * For SCHED_NORMAL/SCHED_BATCH, the valid range is between [-20..19] + * For SCHED_FIFO/SCHED_RR, the value can run between [1..99] + */ FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, + /** + * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds. + */ FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, + /** + * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy + * + * These two bits can be used to set the min scheduling policy at which + * transactions on this node should run. These match the UAPI + * scheduler policy values, eg: + * 00b: SCHED_NORMAL + * 01b: SCHED_FIFO + * 10b: SCHED_RR + * 11b: SCHED_BATCH + */ + FLAT_BINDER_FLAG_SCHED_POLICY_MASK = + 3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT, }; #ifdef BINDER_IPC_32BIT From 7230f99185e5200b959ea3c4387bbea24e30bd9b Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Wed, 7 Jun 2017 10:02:12 -0700 Subject: [PATCH 0939/3220] ANDROID: binder: improve priority inheritance. This is done by raising the priority of a thread selected for a transaction *before* we wake it up. Delay restoring the priority when doing a reply until after we wake up the process receiving the reply. Change-Id: Ic332e4e0ed7d2d3ca6ab1034da4629c9eadd3405 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 74 ++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 21 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 838f4dbc9af5..d046e7798b19 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -611,6 +611,7 @@ enum { * @is_dead: thread is dead and awaiting free * when outstanding transactions are cleaned up * (protected by @proc->inner_lock) + * @task: struct task_struct for this thread * * Bookkeeping structure for binder threads.
*/ @@ -629,6 +630,7 @@ struct binder_thread { struct binder_stats stats; atomic_t tmp_ref; bool is_dead; + struct task_struct *task; }; struct binder_transaction { @@ -647,6 +649,7 @@ struct binder_transaction { unsigned int flags; struct binder_priority priority; struct binder_priority saved_priority; + bool set_priority_called; kuid_t sender_euid; /** * @lock: protects @from, @to_proc, and @to_thread @@ -1161,6 +1164,38 @@ static void binder_set_priority(struct task_struct *task, set_user_nice(task, priority); } +static void binder_transaction_priority(struct task_struct *task, + struct binder_transaction *t, + struct binder_priority node_prio) +{ + struct binder_priority desired_prio; + + if (t->set_priority_called) + return; + + t->set_priority_called = true; + t->saved_priority.sched_policy = task->policy; + t->saved_priority.prio = task->normal_prio; + + desired_prio.prio = t->priority.prio; + desired_prio.sched_policy = t->priority.sched_policy; + + if (node_prio.prio < t->priority.prio || + (node_prio.prio == t->priority.prio && + node_prio.sched_policy == SCHED_FIFO)) { + /* + * In case the minimum priority on the node is + * higher (lower value), use that priority. If + * the priority is the same, but the node uses + * SCHED_FIFO, prefer SCHED_FIFO, since it can + * run unbounded, unlike SCHED_RR. + */ + desired_prio = node_prio; + } + + binder_set_priority(task, desired_prio); +} + static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, binder_uintptr_t ptr) { @@ -2635,11 +2670,15 @@ static bool binder_proc_transaction(struct binder_transaction *t, { struct list_head *target_list = NULL; struct binder_node *node = t->buffer->target_node; + struct binder_priority node_prio; bool oneway = !!(t->flags & TF_ONE_WAY); bool wakeup = true; BUG_ON(!node); binder_node_lock(node); + node_prio.prio = node->min_priority; + node_prio.sched_policy = node->sched_policy; + if (oneway) { BUG_ON(thread); if (node->has_async_transaction) { @@ -2661,12 +2700,14 @@ static bool binder_proc_transaction(struct binder_transaction *t, if (!thread && !target_list) thread = binder_select_thread_ilocked(proc); - if (thread) + if (thread) { target_list = &thread->todo; - else if (!target_list) + binder_transaction_priority(thread->task, t, node_prio); + } else if (!target_list) { target_list = &proc->todo; - else + } else { BUG_ON(target_list != &node->async_todo); + } binder_enqueue_work_ilocked(&t->work, target_list); @@ -2743,7 +2784,6 @@ static void binder_transaction(struct binder_proc *proc, } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); - binder_set_priority(current, in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { return_error = BR_DEAD_REPLY; @@ -3153,6 +3193,7 @@ static void binder_transaction(struct binder_proc *proc, binder_enqueue_work_ilocked(&t->work, &target_thread->todo); binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); + binder_set_priority(current, in_reply_to->saved_priority); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -3241,6 +3282,7 @@ err_no_context_mgr_node: BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { + binder_set_priority(current, in_reply_to->saved_priority); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_work(thread->proc, &thread->return_error.work, @@ -4052,26 +4094,13 @@ retry: 
BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; - struct binder_priority prio = t->priority; + struct binder_priority node_prio; tr.target.ptr = target_node->ptr; tr.cookie = target_node->cookie; - t->saved_priority.sched_policy = current->policy; - t->saved_priority.prio = current->normal_prio; - if (target_node->min_priority < t->priority.prio || - (target_node->min_priority == t->priority.prio && - target_node->sched_policy == SCHED_FIFO)) { - /* - * In case the minimum priority on the node is - * higher (lower value), use that priority. If - * the priority is the same, but the node uses - * SCHED_FIFO, prefer SCHED_FIFO, since it can - * run unbounded, unlike SCHED_RR. - */ - prio.sched_policy = target_node->sched_policy; - prio.prio = target_node->min_priority; - } - binder_set_priority(current, prio); + node_prio.sched_policy = target_node->sched_policy; + node_prio.prio = target_node->min_priority; + binder_transaction_priority(current, t, node_prio); cmd = BR_TRANSACTION; } else { tr.target.ptr = 0; @@ -4247,6 +4276,8 @@ static struct binder_thread *binder_get_thread_ilocked( binder_stats_created(BINDER_STAT_THREAD); thread->proc = proc; thread->pid = current->pid; + get_task_struct(current); + thread->task = current; atomic_set(&thread->tmp_ref, 0); init_waitqueue_head(&thread->wait); INIT_LIST_HEAD(&thread->todo); @@ -4297,6 +4328,7 @@ static void binder_free_thread(struct binder_thread *thread) BUG_ON(!list_empty(&thread->todo)); binder_stats_deleted(BINDER_STAT_THREAD); binder_proc_dec_tmpref(thread->proc); + put_task_struct(thread->task); kfree(thread); } From 39140a0f385c456a43c2dd25bc01503cc3841b97 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 23 Jun 2017 10:13:43 -0700 Subject: [PATCH 0940/3220] ANDROID: binder: add RT inheritance flag to node. Allows a binder node to specify whether it wants to inherit real-time scheduling policy from a caller. 
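As an illustrative sketch (not part of this patch), userspace could combine these UAPI bits as follows. The FLAT_BINDER_FLAG_* constants are the ones introduced by this series; make_node_flags() and the choice of SCHED_FIFO priority 50 are made-up examples:

#include <sched.h>                  /* SCHED_FIFO == 1, matching the 01b policy bits */
#include <linux/android/binder.h>   /* FLAT_BINDER_FLAG_* UAPI constants */

/* Hypothetical helper: encode a node's scheduling preferences. */
static __u32 make_node_flags(void)
{
	__u32 flags = 0;

	/* minimum priority in the low 8 bits; 1..99 is the SCHED_FIFO/SCHED_RR range */
	flags |= 50 & FLAT_BINDER_FLAG_PRIORITY_MASK;
	/* scheduling policy in bits 9-10 */
	flags |= (SCHED_FIFO << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT) &
		 FLAT_BINDER_FLAG_SCHED_POLICY_MASK;
	/* let synchronous callers pass their RT policy into this node */
	flags |= FLAT_BINDER_FLAG_INHERIT_RT;

	return flags;
}

The resulting value would be stored in flat_binder_object.flags when the node is first sent across the binder interface.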
Change-Id: I375b6094bf441c19f19cba06d5a6be02cd07d714 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 22 +++++++++++++++++----- include/uapi/linux/android/binder.h | 8 ++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d046e7798b19..6b24f70dbff8 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -359,6 +359,8 @@ struct binder_error { * (invariant after initialized) * @min_priority: minimum scheduling priority * (invariant after initialized) + * @inherit_rt: inherit RT scheduling policy from caller + * (invariant after initialized) * @async_todo: list of async work items * (protected by @proc->inner_lock) * @@ -395,6 +397,7 @@ struct binder_node { * invariant after initialization */ u8 sched_policy:2; + u8 inherit_rt:1; u8 accept_fds:1; u8 min_priority; }; @@ -1166,7 +1169,8 @@ static void binder_set_priority(struct task_struct *task, static void binder_transaction_priority(struct task_struct *task, struct binder_transaction *t, - struct binder_priority node_prio) + struct binder_priority node_prio, + bool inherit_rt) { struct binder_priority desired_prio; @@ -1177,8 +1181,13 @@ static void binder_transaction_priority(struct task_struct *task, t->saved_priority.sched_policy = task->policy; t->saved_priority.prio = task->normal_prio; - desired_prio.prio = t->priority.prio; - desired_prio.sched_policy = t->priority.sched_policy; + if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) { + desired_prio.prio = NICE_TO_PRIO(0); + desired_prio.sched_policy = SCHED_NORMAL; + } else { + desired_prio.prio = t->priority.prio; + desired_prio.sched_policy = t->priority.sched_policy; + } if (node_prio.prio < t->priority.prio || (node_prio.prio == t->priority.prio && @@ -1283,6 +1292,7 @@ static struct binder_node *binder_init_node_ilocked( FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT; node->min_priority = to_kernel_prio(node->sched_policy, priority); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + node->inherit_rt = !!(flags & FLAT_BINDER_FLAG_INHERIT_RT); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); @@ -2702,7 +2712,8 @@ static bool binder_proc_transaction(struct binder_transaction *t, if (thread) { target_list = &thread->todo; - binder_transaction_priority(thread->task, t, node_prio); + binder_transaction_priority(thread->task, t, node_prio, + node->inherit_rt); } else if (!target_list) { target_list = &proc->todo; } else { @@ -4100,7 +4111,8 @@ retry: tr.cookie = target_node->cookie; node_prio.sched_policy = target_node->sched_policy; node_prio.prio = target_node->min_priority; - binder_transaction_priority(current, t, node_prio); + binder_transaction_priority(current, t, node_prio, + target_node->inherit_rt); cmd = BR_TRANSACTION; } else { tr.target.ptr = 0; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 026558ac254d..70e252bf0be0 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -79,6 +79,14 @@ enum flat_binder_object_flags { */ FLAT_BINDER_FLAG_SCHED_POLICY_MASK = 3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT, + + /** + * @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy + * + * Only when set, calls into this node will inherit a real-time + * scheduling policy from the caller (for synchronous transactions). 
+ */ + FLAT_BINDER_FLAG_INHERIT_RT = 0x800, }; #ifdef BINDER_IPC_32BIT From bab9c2fbe48e9ebe6af7fa19ade1bf558dcc2eac Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:04:04 +0100 Subject: [PATCH 0941/3220] UPSTREAM: cpufreq: schedutil: Avoid indented labels Switch to the more common practice of writing labels. Suggested-by: Peter Zijlstra Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry picked from commit 8e2ddb03643eb9d0bc4926946d7ce0d308eef0a5) Signed-off-by: Chris Redpath Change-Id: Ida75c99cf3dff5cae24d3866454c83bcdb3385b9 --- kernel/sched/cpufreq_schedutil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 75bfbb336722..ff9da5144d8a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -623,13 +623,13 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); From ceed1eb2b409d74ca694cbf4cdee536476a280c8 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:06:27 +0100 Subject: [PATCH 0942/3220] UPSTREAM: cpufreq: schedutil: enable fast switch earlier The fast_switch_enabled flag will be used by both sugov_policy_alloc() and sugov_policy_free() with a later patch. Prepare for that by moving the calls to enable and disable it to the beginning of sugov_init() and end of sugov_exit(). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry picked from commit 4a71ce4348bb61740d411822357061f8bf870f4c) Signed-off-by: Chris Redpath Change-Id: Ia174f423ca02d59360657ac2e77a5098ce5cf99c --- kernel/sched/cpufreq_schedutil.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index ff9da5144d8a..de24e1983614 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -578,9 +578,13 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } ret = sugov_kthread_create(sg_policy); if (ret) @@ -625,8 +629,6 @@ static int sugov_init(struct cpufreq_policy *policy) out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; fail: @@ -640,6 +642,10 @@ free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -650,8 +656,6 @@ static int sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -664,6 +668,7 @@ static int sugov_exit(struct cpufreq_policy *policy) sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); return 0; } From d9e7d036e7897f0c523fafe596316248783f61f7 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:10:26 +0100 Subject: [PATCH 0943/3220] UPSTREAM: cpufreq: schedutil: irq-work and mutex are only 
used in slow path Execute the irq-work specific initialization/exit code only when the fast path isn't available. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry picked from commit 21ef57297b15a49b0c4dd4e7135c1a08e9a29a1c) Signed-off-by: Chris Redpath Change-Id: Icfd68f455ef71846d799fcd2d8ec6aa1bf59573e --- kernel/sched/cpufreq_schedutil.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index de24e1983614..8de497e64a13 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -489,15 +489,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } @@ -531,6 +528,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) sg_policy->thread = thread; kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + wake_up_process(thread); return 0; @@ -544,6 +544,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy) flush_kthread_worker(&sg_policy->worker); kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); } static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) @@ -719,9 +720,10 @@ static int sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - kthread_cancel_work_sync(&sg_policy->work); - + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } return 0; } From 69fc75780d1521448ba5288545a5d522a43ad062 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 24 Nov 2016 13:51:11 +0530 Subject: [PATCH 0944/3220] UPSTREAM: cpufreq: schedutil: Rectify comment in sugov_irq_work() function This patch rectifies a comment present in sugov_irq_work() function to follow proper grammar. Suggested-by: Ingo Molnar Signed-off-by: Viresh Kumar Acked-by: Ingo Molnar Signed-off-by: Rafael J. Wysocki (cherry picked from commit d06e622d3d9206e6a2cc45a0f9a3256da8773ff4) Signed-off-by: Chris Redpath Change-Id: Iaf996445d411725639d511432cc424086892a146 --- kernel/sched/cpufreq_schedutil.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8de497e64a13..b76286a8e118 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -371,15 +371,15 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For Real Time and Deadline tasks, schedutil governor shoots the - * frequency to maximum. And special care must be taken to ensure that - * this kthread doesn't result in that. + * For RT and deadline tasks, the schedutil governor shoots the + * frequency to maximum. Special care must be taken to ensure that this + * kthread doesn't result in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is - * updated only at the end of the sugov_work() and before that schedutil - * rejects all other frequency scaling requests. 
+ * updated only at the end of the sugov_work() function and before that + * the schedutil governor rejects all other frequency scaling requests. * - * Though there is a very rare case where the RT thread yields right + * There is a very rare case though, where the RT thread yields right * after the work_in_progress flag is cleared. The effects of that are * neglected for now. */ From 0646dd35928e446d8d2f8075a349b3abb521fa06 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:12:38 +0100 Subject: [PATCH 0945/3220] UPSTREAM: cpufreq: schedutil: move cached_raw_freq to struct sugov_policy cached_raw_freq applies to the entire cpufreq policy and not individual CPUs. Apart from wasting per-cpu memory, it is actually wrong to keep it in struct sugov_cpu as we may end up comparing next_freq with a stale cached_raw_freq of a random CPU. Move cached_raw_freq to struct sugov_policy. Fixes: 5cbea46984d6 (cpufreq: schedutil: map raw required frequency to driver frequency) Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry-picked from 6c4f0fa643cb9e775dcc976e3db00d649468ff1d) Signed-off-by: Chris Redpath Change-Id: Ie91420f710819b383947f9031da9be1f3bb7f636 --- kernel/sched/cpufreq_schedutil.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index b76286a8e118..a8ca00a02ded 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,6 +47,7 @@ struct sugov_policy { s64 up_rate_delay_ns; s64 down_rate_delay_ns; unsigned int next_freq; + unsigned int cached_raw_freq; /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; @@ -63,7 +64,6 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; - unsigned int cached_raw_freq; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -180,9 +180,9 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) return sg_policy->next_freq; - sg_cpu->cached_raw_freq = freq; + sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -687,6 +687,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = UINT_MAX; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); @@ -697,7 +698,6 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->max = 0; sg_cpu->flags = SCHED_CPUFREQ_DL; sg_cpu->last_update = 0; - sg_cpu->cached_raw_freq = 0; sg_cpu->iowait_boost = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, From cbaccedead5cc87296194c6a668ea38de8d7c17c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 2 Mar 2017 14:03:21 +0530 Subject: [PATCH 0946/3220] UPSTREAM: cpufreq: schedutil: Pass sg_policy to get_next_freq() get_next_freq() uses sg_cpu only to get sg_policy, which the callers of get_next_freq() already have. Pass sg_policy instead of sg_cpu to get_next_freq(), to make it more efficient. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki (cherry picked from commit 655cb1ebff4b7918fc560502c3297af2d3c7d114) Signed-off-by: Chris Redpath Change-Id: Ia210058da32930a6cdb18258aa679cd1a44a747e --- kernel/sched/cpufreq_schedutil.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a8ca00a02ded..9bf525a960f3 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -150,7 +150,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, /** * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @sg_policy: schedutil policy object to compute the new frequency for. * @util: Current CPU utilization. * @max: CPU capacity. * @@ -170,10 +170,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, * next_freq (as calculated above) is returned, subject to policy min/max and * cpufreq driver limitations. */ -static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, - unsigned long max) +static unsigned int get_next_freq(struct sugov_policy *sg_policy, + unsigned long util, unsigned long max) { - struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; @@ -268,7 +267,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, } else { sugov_get_util(&util, &max, time); sugov_iowait_boost(sg_cpu, &util, &max); - next_f = get_next_freq(sg_cpu, util, max); + next_f = get_next_freq(sg_policy, util, max); } sugov_update_commit(sg_policy, time, next_f); } @@ -322,7 +321,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, sugov_iowait_boost(j_sg_cpu, &util, &max); } - return get_next_freq(sg_cpu, util, max); + return get_next_freq(sg_policy, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, From 7378c38a80fc434f8d6cf61167d843ca388468b5 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:15:33 +0100 Subject: [PATCH 0947/3220] UPSTREAM: cpufreq: schedutil: Fix per-CPU structure initialization in sugov_start() sugov_start() only initializes struct sugov_cpu per-CPU structures for shared policies, but it should do that for single-CPU policies too. That in particular makes the IO-wait boost mechanism work in the cases when cpufreq policies correspond to individual CPUs. Fixes: 21ca6d2c52f8 (cpufreq: schedutil: Add iowait boosting) Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar Cc: 4.9+ # 4.9+ (cherry picked from commit 4296f23ed49a15d36949458adcc66ff993dee2a8) (we use SCHED_CPUFREQ_DL instead of SCHED_CPUFREQ_RT in cpu->flags) Signed-off-by: Chris Redpath Change-Id: I5b837a0ee4432115d85caa1a9808ea61e1e1b07f --- kernel/sched/cpufreq_schedutil.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9bf525a960f3..3a9283640f84 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -691,20 +691,14 @@ static int sugov_start(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->sg_policy = sg_policy; - if (policy_is_shared(policy)) { - sg_cpu->util = 0; - sg_cpu->max = 0; - sg_cpu->flags = SCHED_CPUFREQ_DL; - sg_cpu->last_update = 0; - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_shared); - } else { - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_single); - } + sg_cpu->flags = SCHED_CPUFREQ_DL; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + policy_is_shared(policy) ? + sugov_update_shared : + sugov_update_single); } return 0; } From a8a200d83b1ff334a6c607b8fda848551648689b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:22:59 +0100 Subject: [PATCH 0948/3220] UPSTREAM: cpufreq: schedutil: Refactor sugov_next_freq_shared() The loop in sugov_next_freq_shared() contains an if block to skip the loop for the current CPU. This turns out to be an unnecessary conditional in the scheduler's hot-path for every CPU in the policy. It would be better to drop the conditional and make the loop treat all the CPUs in the same way. That would eliminate the need of calling sugov_iowait_boost() at the top of the routine. To keep the code optimized to return early if the current CPU has RT/DL flags set, move the flags check to sugov_update_shared() instead in order to avoid the function call entirely. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki (cherry picked from commit cba1dfb57b94c234728b689d9b00d4267fa1a879) (modified for SCHED_CPUFREQ_DL vs SCHED_CPUFREQ_RT) Signed-off-by: Chris Redpath Change-Id: Ie046fdc8eda46821356750edd0fb6f7d077af363 --- kernel/sched/cpufreq_schedutil.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3a9283640f84..e1a2ed98991f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -272,30 +272,19 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max, - unsigned int flags) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned long util = 0, max = 1; unsigned int j; - if (flags & SCHED_CPUFREQ_DL) - return max_f; - - sugov_iowait_boost(sg_cpu, &util, &max); - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu; + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; s64 delta_ns; - if (j == smp_processor_id()) - continue; - - j_sg_cpu = &per_cpu(sugov_cpu, j); /* * If the CPU utilization was last updated before the previous * frequency update and the time elapsed between the last update @@ -309,7 +298,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) - return max_f; + return policy->cpuinfo.max_freq; j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; @@ -344,7 +333,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + if (flags & SCHED_CPUFREQ_DL) + next_f = sg_policy->policy->cpuinfo.max_freq; + else + next_f = sugov_next_freq_shared(sg_cpu); + sugov_update_commit(sg_policy, time, next_f); } From 537d19226a8c4d36cc376f6b177576dff989a97d Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:24:58 +0100 Subject: [PATCH 0949/3220] UPSTREAM: cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely The way the schedutil governor uses the PELT metric causes it to underestimate the CPU utilization in some cases. That can be easily demonstrated by running kernel compilation on a Sandy Bridge Intel processor, running turbostat in parallel with it and looking at the values written to the MSR_IA32_PERF_CTL register. Namely, the expected result would be that when all CPUs were 100% busy, all of them would be requested to run in the maximum P-state, but observation shows that this clearly isn't the case. The CPUs run in the maximum P-state for a while and then are requested to run slower and go back to the maximum P-state after a while again. That causes the actual frequency of the processor to visibly oscillate below the sustainable maximum in a jittery fashion which clearly is not desirable. That has been attributed to CPU utilization metric updates on task migration that cause the total utilization value for the CPU to be reduced by the utilization of the migrated task. 
If that happens, the schedutil governor may see a CPU utilization reduction and will attempt to reduce the CPU frequency accordingly right away. That may be premature, though, for example if the system is generally busy and there are other runnable tasks waiting to be run on that CPU already. This is unlikely to be an issue on systems where cpufreq policies are shared between multiple CPUs, because in those cases the policy utilization is computed as the maximum of the CPU utilization values over the whole policy and if that turns out to be low, reducing the frequency for the policy most likely is a good idea anyway. On systems with one CPU per policy, however, it may affect performance adversely and even lead to increased energy consumption in some cases. On those systems it may be addressed by taking another utilization metric into consideration, like whether or not the CPU whose frequency is about to be reduced has been idle recently, because if that's not the case, the CPU is likely to be busy in the near future and its frequency should not be reduced. To that end, use the counter of idle calls in the timekeeping code. Namely, make the schedutil governor look at that counter for the current CPU every time before its frequency is about to be reduced. If the counter has not changed since the previous iteration of the governor computations for that CPU, the CPU has been busy for all that time and its frequency should not be decreased, so if the new frequency would be lower than the one set previously, the governor will skip the frequency update. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Reviewed-by: Joel Fernandes (cherry picked from commit b7eaf1aab9f8bd2e49fceed77ebc66c1b5800718) (simple CPUFREQ_RT_DL vs CPUFREQ_DL usage conflicts) Signed-off-by: Chris Redpath Change-Id: I531ec02c052944ee07a904dc2a25c59948ee762b --- include/linux/tick.h | 1 + kernel/sched/cpufreq_schedutil.c | 27 +++++++++++++++++++++++++++ kernel/time/tick-sched.c | 12 ++++++++++++ 3 files changed, 40 insertions(+) diff --git a/include/linux/tick.h b/include/linux/tick.h index e312219ff823..af5ac7f91a3b 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -103,6 +103,7 @@ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); +extern unsigned long tick_nohz_get_idle_calls(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e1a2ed98991f..49922187b57c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -72,6 +72,11 @@ struct sugov_cpu { unsigned long util; unsigned long max; unsigned int flags; + + /* The field below is for single-CPU policies only. 
*/ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -247,6 +252,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, sg_cpu->iowait_boost >>= 1; } +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -255,6 +273,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; + bool busy; sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -262,12 +281,20 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; + busy = sugov_cpu_is_busy(sg_cpu); + if (flags & SCHED_CPUFREQ_DL) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max, time); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. + */ + if (busy && next_f < sg_policy->next_freq) + next_f = sg_policy->next_freq; } sugov_update_commit(sg_policy, time, next_f); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 22c57e191a23..34f9a9c417d9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -870,6 +870,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE From 2ee9941b0bbcc6a9b047d78bcb9f18be98a15625 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Thu, 25 May 2017 15:27:07 +0100 Subject: [PATCH 0950/3220] UPSTREAM: cpufreq: schedutil: Trace frequency only if it has changed sugov_update_commit() calls trace_cpu_frequency() to record the current CPU frequency if it has not changed in the fast switch case to prevent utilities from getting confused (they may report that the CPU is idle if the frequency has not been recorded for too long, for example). However, that may cause the tracepoint to be triggered quite often for no real reason (if the frequency doesn't change, we will not modify the last update time stamp and governor computations may run again shortly when that happens), so don't do that (arguably, it is done to work around a utilities bug anyway). That allows code duplication in sugov_update_commit() to be reduced somewhat too. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar (cherry picked from commit 38d4ea229d25d30be6bf41bcd6cd663a587866ca) (conflicts with sugov_up_down_rate_limit resolved) Signed-off-by: Chris Redpath Change-Id: Ia019dda29b8c1c4cf3553da75c88d066eb5674e9 --- kernel/sched/cpufreq_schedutil.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 49922187b57c..e12309c1b07b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -132,22 +132,20 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) return; + if (sg_policy->next_freq == next_freq) + return; + + sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; + if (policy->fast_switch_enabled) { - if (sg_policy->next_freq == next_freq) { - trace_cpu_frequency(policy->cur, smp_processor_id()); - return; - } - sg_policy->next_freq = next_freq; - sg_policy->last_freq_update_time = time; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else if (sg_policy->next_freq != next_freq) { - sg_policy->next_freq = next_freq; - sg_policy->last_freq_update_time = time; + } else { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } From 89ce9d97e661ee44741bd1ab471c4d9a8a27d077 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Tue, 20 Jun 2017 13:54:44 -0700 Subject: [PATCH 0951/3220] Add BINDER_GET_NODE_DEBUG_INFO ioctl The BINDER_GET_NODE_DEBUG_INFO ioctl will return debug info on a node. Each successive call reusing the previous return value will return the next node. The data will be used by libmemunreachable to mark the pointers with kernel references as reachable. 
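As a usage sketch (not part of this patch), a process could walk its own nodes by feeding each returned ptr back into the next call. dump_nodes() and its error handling are illustrative assumptions; the ioctl and structure are the ones added below:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/android/binder.h>

/* Hypothetical helper: print debug info for every node of the process
 * that owns binder_fd, in ascending ptr order. */
static void dump_nodes(int binder_fd)
{
	struct binder_node_debug_info info = { .ptr = 0 }; /* 0 selects the first node */

	for (;;) {
		if (ioctl(binder_fd, BINDER_GET_NODE_DEBUG_INFO, &info) < 0)
			return; /* driver error */
		if (!info.ptr)
			break;  /* ptr stays 0 once all nodes have been seen */
		printf("node u%016llx c%016llx strong=%u weak=%u\n",
		       (unsigned long long)info.ptr,
		       (unsigned long long)info.cookie,
		       info.has_strong_ref, info.has_weak_ref);
		/* the returned info.ptr seeds the next iteration */
	}
}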
Bug: 28275695 Change-Id: Idbbafa648a33822dc023862cd92b51a595cf7c1c Signed-off-by: Colin Cross Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 42 +++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 14 ++++++++++ 2 files changed, 56 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 6b24f70dbff8..4247d7d8b8e6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4548,6 +4548,30 @@ out: return ret; } +static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, + struct binder_node_debug_info *info) { + struct rb_node *n; + binder_uintptr_t ptr = info->ptr; + + memset(info, 0, sizeof(*info)); + + binder_inner_proc_lock(proc); + for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { + struct binder_node *node = rb_entry(n, struct binder_node, + rb_node); + if (node->ptr > ptr) { + info->ptr = node->ptr; + info->cookie = node->cookie; + info->has_strong_ref = node->has_strong_ref; + info->has_weak_ref = node->has_weak_ref; + break; + } + } + binder_inner_proc_unlock(proc); + + return 0; +} + static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@ -4615,6 +4639,24 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } + case BINDER_GET_NODE_DEBUG_INFO: { + struct binder_node_debug_info info; + + if (copy_from_user(&info, ubuf, sizeof(info))) { + ret = -EFAULT; + goto err; + } + + ret = binder_ioctl_get_node_debug_info(proc, &info); + if (ret < 0) + goto err; + + if (copy_to_user(ubuf, &info, sizeof(info))) { + ret = -EFAULT; + goto err; + } + break; + } default: ret = -EINVAL; goto err; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 70e252bf0be0..5539933b3491 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -233,6 +233,19 @@ struct binder_version { #define BINDER_CURRENT_PROTOCOL_VERSION 8 #endif +/* + * Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields. + * Set ptr to NULL for the first call to get the info for the first node, and + * then repeat the call passing the previously returned value to get the next + * nodes. ptr will be 0 when there are no more nodes. + */ +struct binder_node_debug_info { + binder_uintptr_t ptr; + binder_uintptr_t cookie; + __u32 has_strong_ref; + __u32 has_weak_ref; +}; + #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -240,6 +253,7 @@ struct binder_version { #define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32) #define BINDER_THREAD_EXIT _IOW('b', 8, __s32) #define BINDER_VERSION _IOWR('b', 9, struct binder_version) +#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) /* * NOTE: Two special error codes you should check for when calling From 76b376eac7a2e9584afb58af87d5451f55b82e3d Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 26 May 2017 10:48:56 -0700 Subject: [PATCH 0952/3220] ANDROID: binder: don't check prio permissions on restore. 
Because we have disabled RT priority inheritance for the regular binder domain, the following can happen: 1) thread A (prio 98) calls into thread B 2) because RT prio inheritance is disabled, thread B runs at the lowest nice (prio 100) instead 3) thread B calls back into A; A will run at prio 100 for the duration of the transaction 4) When thread A is done with the call from B, we will try to restore the prio back to 98. But, we fail because the process doesn't hold CAP_SYS_NICE, neither is RLIMIT_RT_PRIO set. While the proper fix going forward will be to correctly apply CAP_SYS_NICE or RLIMIT_RT_PRIO, for now it seems reasonable to not check permissions on the restore path. Change-Id: Ibede5960c9b7bb786271c001e405de50be64d944 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4247d7d8b8e6..29e3e3f21443 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1109,8 +1109,9 @@ static int to_kernel_prio(int policy, int user_priority) return MAX_USER_RT_PRIO - 1 - user_priority; } -static void binder_set_priority(struct task_struct *task, - struct binder_priority desired) +static void binder_do_set_priority(struct task_struct *task, + struct binder_priority desired, + bool verify) { int priority; /* user-space prio value */ bool has_cap_nice; @@ -1123,7 +1124,7 @@ static void binder_set_priority(struct task_struct *task, priority = to_userspace_prio(policy, desired.prio); - if (is_rt_policy(policy) && !has_cap_nice) { + if (verify && is_rt_policy(policy) && !has_cap_nice) { long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO); if (max_rtprio == 0) { @@ -1134,7 +1135,7 @@ static void binder_set_priority(struct task_struct *task, } } - if (is_fair_policy(policy) && !has_cap_nice) { + if (verify && is_fair_policy(policy) && !has_cap_nice) { long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE)); if (min_nice > MAX_NICE) { @@ -1167,6 +1168,18 @@ static void binder_set_priority(struct task_struct *task, set_user_nice(task, priority); } +static void binder_set_priority(struct task_struct *task, + struct binder_priority desired) +{ + binder_do_set_priority(task, desired, /* verify = */ true); +} + +static void binder_restore_priority(struct task_struct *task, + struct binder_priority desired) +{ + binder_do_set_priority(task, desired, /* verify = */ false); +} + static void binder_transaction_priority(struct task_struct *task, struct binder_transaction *t, struct binder_priority node_prio, @@ -3204,7 +3217,7 @@ static void binder_transaction(struct binder_proc *proc, binder_enqueue_work_ilocked(&t->work, &target_thread->todo); binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); - binder_set_priority(current, in_reply_to->saved_priority); + binder_restore_priority(current, in_reply_to->saved_priority); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); @@ -3293,7 +3306,7 @@ err_no_context_mgr_node: BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { - binder_set_priority(current, in_reply_to->saved_priority); + binder_restore_priority(current, in_reply_to->saved_priority); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_work(thread->proc, &thread->return_error.work, @@ -3893,7 +3906,7 @@ retry: wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } - 
binder_set_priority(current, proc->default_priority); + binder_restore_priority(current, proc->default_priority); } if (non_block) { From d01a860b54f7fe60ae199d05c6e463be8e616bc1 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 19 Jul 2017 17:16:43 -0700 Subject: [PATCH 0953/3220] ANDROID: sdcardfs: Remove unnecessary lock The mmap_sem lock does not appear to be protecting anything, and has been removed in Samsung's more recent versions of sdcardfs. Signed-off-by: Daniel Rosenberg Change-Id: I76ff3e33002716b8384fc8be368028ed63dffe4e Bug: 63785372 --- fs/sdcardfs/inode.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 60fea424835f..103dc45a131f 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -766,13 +766,9 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct * afterwards in the other cases: we fsstack_copy_inode_size from * the lower level. */ - if (current->mm) - down_write(¤t->mm->mmap_sem); if (ia->ia_valid & ATTR_SIZE) { err = inode_newsize_ok(&tmp, ia->ia_size); if (err) { - if (current->mm) - up_write(¤t->mm->mmap_sem); goto out; } truncate_setsize(inode, ia->ia_size); @@ -795,8 +791,6 @@ static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct err = notify_change2(lower_mnt, lower_dentry, &lower_ia, /* note: lower_ia */ NULL); mutex_unlock(&d_inode(lower_dentry)->i_mutex); - if (current->mm) - up_write(¤t->mm->mmap_sem); if (err) goto out; From 492a6047e7b5a1383627189e8bff99ff181ef4a4 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 3 Jun 2016 13:16:59 -0700 Subject: [PATCH 0954/3220] ANDROID: android-verity: mark dev as rw for linear target Mark as rw when adding as linear target to allow changes to the underlying filesystem through adb disable verity and adb remount. (Cherry-picked from https://partner-android-review.googlesource.com/#/c/613573/ 79a3032bb62da65a5d724eb70c8bdc662945d475) BUG: 28845874 Signed-off-by: Badhri Jagan Sridharan Change-Id: If41e9cad8e0f054f4778c09a6e2f0cb8af6fddaf --- drivers/md/dm-android-verity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c index c3c9502baf18..c521df010ee3 100644 --- a/drivers/md/dm-android-verity.c +++ b/drivers/md/dm-android-verity.c @@ -645,6 +645,8 @@ static int add_as_linear_device(struct dm_target *ti, char *dev) android_verity_target.iterate_devices = dm_linear_iterate_devices, android_verity_target.io_hints = NULL; + set_disk_ro(dm_disk(dm_table_get_md(ti->table)), 0); + err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args); if (!err) { From 362e08d2572fd592b6a5322763977d898ebefba2 Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Fri, 7 Jul 2017 11:27:31 -0700 Subject: [PATCH 0955/3220] Revert "proc: smaps: Allow smaps access for CAP_SYS_RESOURCE" This reverts commit 9d19f72b43f495f6f1ef1268dbed1bbade8dea24. This fixes CVE-2017-0710. SELinux allows more fine grained control: We grant processes that need access to smaps CAP_SYS_PTRACE but prohibit them from using ptrace attach(). 
Bug: 34951864 Bug: 36468447 Change-Id: I8ea67f8771ec212950bc251ee750bd8a7e7c0643 Signed-off-by: Daniel Mentz --- kernel/fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 968917653c2c..68cfda1c1800 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -827,8 +827,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode) && - !capable(CAP_SYS_RESOURCE)) { + !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } From ca2ecff8649ee9db3d00fca5fc20e5a972fde18f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 21 Jul 2017 09:41:06 -0700 Subject: [PATCH 0956/3220] ANDROID: lowmemorykiller: Add tgid to kill message Bug: 33346201 Test: trigger LMK and verify tgid is there Change-Id: I047abc3aa541522766d2a84ebb4c77caf57a18a3 Signed-off-by: Joel Fernandes --- drivers/staging/android/lowmemorykiller.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 606d13a1ea74..2f4ef2120d31 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -177,11 +177,11 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) mark_oom_victim(selected); task_unlock(selected); trace_lowmemory_kill(selected, cache_size, cache_limit, free); - lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" \ + lowmem_print(1, "Killing '%s' (%d) (tgid %d), adj %hd,\n" \ " to free %ldkB on behalf of '%s' (%d) because\n" \ " cache %ldkB is below limit %ldkB for oom_score_adj %hd\n" \ " Free memory is %ldkB above reserved\n", - selected->comm, selected->pid, + selected->comm, selected->pid, selected->tgid, selected_oom_score_adj, selected_tasksize * (long)(PAGE_SIZE / 1024), current->comm, current->pid, From 3715386152f1cd1585273af3ff950d4b0378e524 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 13 Apr 2017 18:35:59 +0800 Subject: [PATCH 0957/3220] UPSTREAM: af_key: Fix sadb_x_ipsecrequest parsing (cherry picked from commit 096f41d3a8fcbb8dde7f71379b1ca85fe213eded) The parsing of sadb_x_ipsecrequest is broken in a number of ways. First of all we're not verifying sadb_x_ipsecrequest_len. This is needed when the structure carries addresses at the end. Worse we don't even look at the length when we parse those optional addresses. The migration code had similar parsing code that's better but it also has some deficiencies. The length is overcounted first of all as it includes the header itself. It also fails to check the length before dereferencing the sa_family field. This patch fixes those problems in parse_sockaddr_pair and then uses it in parse_ipsecrequest. 
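The validation pattern the fix converges on can be sketched as follows (illustrative only; walk_one() stands in for the real per-record handling in af_key.c): every record must claim at least its own header, must not claim more bytes than remain, and any trailing addresses are parsed against the record's declared length instead of being dereferenced blindly:

static int walk_ipsecrequests(struct sadb_x_ipsecrequest *rq, int len)
{
	while (len >= (int)sizeof(*rq)) {
		/* the record must cover its header and fit in what is left */
		if (rq->sadb_x_ipsecrequest_len < sizeof(*rq) ||
		    rq->sadb_x_ipsecrequest_len > len)
			return -EINVAL;
		/* optional addresses live after the header; pass their exact
		 * size so sa_family is never read past the record */
		walk_one(rq, rq->sadb_x_ipsecrequest_len - sizeof(*rq));
		len -= rq->sadb_x_ipsecrequest_len;
		rq = (struct sadb_x_ipsecrequest *)
			((u8 *)rq + rq->sadb_x_ipsecrequest_len);
	}
	return 0;
}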
Reported-by: Andrey Konovalov Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert Bug: 63963140 Change-Id: Ie8854131293bc8153bfb8cb255161ec8f03667f9 --- net/key/af_key.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index e67c28e614b9..d8d95b6415e4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -65,6 +65,10 @@ struct pfkey_sock { } dump; }; +static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, + xfrm_address_t *saddr, xfrm_address_t *daddr, + u16 *family); + static inline struct pfkey_sock *pfkey_sk(struct sock *sk) { return (struct pfkey_sock *)sk; @@ -1922,19 +1926,14 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) /* addresses present only in tunnel mode */ if (t->mode == XFRM_MODE_TUNNEL) { - u8 *sa = (u8 *) (rq + 1); - int family, socklen; + int err; - family = pfkey_sockaddr_extract((struct sockaddr *)sa, - &t->saddr); - if (!family) - return -EINVAL; - - socklen = pfkey_sockaddr_len(family); - if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen), - &t->id.daddr) != family) - return -EINVAL; - t->encap_family = family; + err = parse_sockaddr_pair( + (struct sockaddr *)(rq + 1), + rq->sadb_x_ipsecrequest_len - sizeof(*rq), + &t->saddr, &t->id.daddr, &t->encap_family); + if (err) + return err; } else t->encap_family = xp->family; @@ -1954,7 +1953,11 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) return -EINVAL; - while (len >= sizeof(struct sadb_x_ipsecrequest)) { + while (len >= sizeof(*rq)) { + if (len < rq->sadb_x_ipsecrequest_len || + rq->sadb_x_ipsecrequest_len < sizeof(*rq)) + return -EINVAL; + if ((err = parse_ipsecrequest(xp, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; @@ -2417,7 +2420,6 @@ out: return err; } -#ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_sockaddr_pair_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); @@ -2429,7 +2431,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, { int af, socklen; - if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) + if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) return -EINVAL; af = pfkey_sockaddr_extract(sa, saddr); @@ -2445,6 +2447,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, return 0; } +#ifdef CONFIG_NET_KEY_MIGRATE static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct xfrm_migrate *m) { @@ -2452,13 +2455,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct sadb_x_ipsecrequest *rq2; int mode; - if (len <= sizeof(struct sadb_x_ipsecrequest) || - len < rq1->sadb_x_ipsecrequest_len) + if (len < sizeof(*rq1) || + len < rq1->sadb_x_ipsecrequest_len || + rq1->sadb_x_ipsecrequest_len < sizeof(*rq1)) return -EINVAL; /* old endoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), - rq1->sadb_x_ipsecrequest_len, + rq1->sadb_x_ipsecrequest_len - sizeof(*rq1), &m->old_saddr, &m->old_daddr, &m->old_family); if (err) @@ -2467,13 +2471,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); len -= rq1->sadb_x_ipsecrequest_len; - if (len <= sizeof(struct sadb_x_ipsecrequest) || - len < rq2->sadb_x_ipsecrequest_len) + if (len <= sizeof(*rq2) || + len < rq2->sadb_x_ipsecrequest_len || + 
rq2->sadb_x_ipsecrequest_len < sizeof(*rq2)) return -EINVAL; /* new endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), - rq2->sadb_x_ipsecrequest_len, + rq2->sadb_x_ipsecrequest_len - sizeof(*rq2), &m->new_saddr, &m->new_daddr, &m->new_family); if (err) From 5680f23f20c73f6348fe73dc23a025a965d69e28 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 29 Mar 2017 09:01:06 +0100 Subject: [PATCH 0958/3220] sched/fair: streamline find_best_target heuristics The find_best_target() code has evolved over time to integrate different micro-optimizations, to the point that it is now quite difficult to follow exactly what it's doing. This patch refactors the existing code to make it more readable and easier to maintain. It does that by properly identifying the three main use-cases and addressing them in priority order: A) latency sensitive tasks B) non latency sensitive tasks on IDLE CPUs C) non latency sensitive tasks on ACTIVE CPUs The original behaviors are preserved. Some tests to compare power/performances before and after this patch have been done using Jankbench and YouTube and we did not notice any significant differences. The only difference with respect to the original code is a small update to favor lower-capacity idle CPUs in case B. The same preference is not enforced in case A since this can lead to a selection of a non-reserved CPU for TOP_APP tasks, which ultimately can lead to undesirable co-scheduling side-effects. Change-Id: I871e5d95af89176217e4e239b64d44a420baabe8 Signed-off-by: Patrick Bellasi (removed checkpatch whitespace error) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 262 +++++++++++++++++++++++++++++++++----------- kernel/sched/walt.h | 2 + 2 files changed, 197 insertions(+), 67 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 226d1990eea9..a61c47a030a0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6189,46 +6189,54 @@ static int start_cpu(bool boosted) static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { - int target_cpu = -1; - unsigned long target_util = prefer_idle ?
ULONG_MAX : 0; - unsigned long backup_capacity = ULONG_MAX; - int best_idle_cpu = -1; - int best_idle_cstate = INT_MAX; - int backup_cpu = -1; + unsigned long best_idle_min_cap_orig = ULONG_MAX; unsigned long min_util = boosted_task_util(p); + unsigned long target_capacity = ULONG_MAX; + unsigned long min_wake_util = ULONG_MAX; + unsigned long target_max_spare_cap = 0; + unsigned long target_util = ULONG_MAX; + unsigned long best_active_util = ULONG_MAX; + int best_idle_cstate = INT_MAX; struct sched_domain *sd; struct sched_group *sg; - int cpu = start_cpu(boosted); + int best_active_cpu = -1; + int best_idle_cpu = -1; + int target_cpu = -1; + int cpu, i; schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); schedstat_inc(this_rq(), eas_stats.fbt_attempts); + /* Find start CPU based on boost value */ + cpu = start_cpu(boosted); if (cpu < 0) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); - return target_cpu; + return -1; } + /* Find SD for the start CPU */ sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!sd) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd); schedstat_inc(this_rq(), eas_stats.fbt_no_sd); - return target_cpu; + return -1; } + /* Scan CPUs in all SDs */ sg = sd->groups; - do { - int i; - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - unsigned long cur_capacity, new_util, wake_util; - unsigned long min_wake_util = ULONG_MAX; + unsigned long capacity_curr = capacity_curr_of(i); + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util; if (!cpu_online(i)) continue; + if (walt_cpu_high_irqload(i)) + continue; + /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double @@ -6243,70 +6251,190 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre * than the one required to boost the task. */ new_util = max(min_util, new_util); - - if (new_util > capacity_orig_of(i)) + if (new_util > capacity_orig) continue; -#ifdef CONFIG_SCHED_WALT - if (walt_cpu_high_irqload(i)) - continue; -#endif - /* - * Unconditionally favoring tasks that prefer idle cpus to + * Case A) Latency sensitive tasks + * + * Unconditionally favoring tasks that prefer idle CPU to * improve latency. + * + * Looking for: + * - an idle CPU, whatever its idle_state is, since + * the first CPUs we explore are more likely to be + * reserved for latency sensitive tasks. + * - a non idle CPU where the task fits in its current + * capacity and has the maximum spare capacity. + * - a non idle CPU with lower contention from other + * tasks and running at the lowest possible OPP. + * + * The last two goals try to favor a non idle CPU + * where the task can run as if it is "almost alone". + * A maximum spare capacity CPU is favoured since + * the task already fits into that CPU's capacity + * without waiting for an OPP chance. + * + * The following code path is the only one in the CPUs + * exploration loop which is always used by + * prefer_idle tasks. It exits the loop with either a + * best_active_cpu or a target_cpu which should + * represent an optimal choice for latency sensitive + * tasks.
*/ - if (idle_cpu(i) && prefer_idle) { - schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); - schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); - return i; - } + if (prefer_idle) { - cur_capacity = capacity_curr_of(i); - - if (new_util < cur_capacity) { - if (cpu_rq(i)->nr_running) { - /* - * Find a target cpu with the lowest/highest - * utilization if prefer_idle/!prefer_idle. - */ - if (prefer_idle) { - /* Favor the CPU that last ran the task */ - if (new_util > target_util || - wake_util > min_wake_util) - continue; - min_wake_util = wake_util; - target_util = new_util; - target_cpu = i; - } else if (target_util < new_util) { - target_util = new_util; - target_cpu = i; - } - } else if (!prefer_idle) { - int idle_idx = idle_get_state_idx(cpu_rq(i)); - - if (best_idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - best_idle_cpu = i; - } + /* + * Case A.1: IDLE CPU + * Return the first IDLE CPU we find. + */ + if (idle_cpu(i)) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); + return i; } - } else if (backup_capacity > cur_capacity) { - /* Find a backup cpu with least capacity. */ - backup_capacity = cur_capacity; - backup_cpu = i; + + /* + * Case A.2: Target ACTIVE CPU + * Favor CPUs with max spare capacity. + */ + if ((capacity_curr > new_util) && + (capacity_orig - new_util > target_max_spare_cap)) { + target_max_spare_cap = capacity_orig - new_util; + target_cpu = i; + continue; + } + if (target_cpu != -1) + continue; + + + /* + * Case A.3: Backup ACTIVE CPU + * Favor CPUs with: + * - lower utilization due to other tasks + * - lower utilization with the task in + */ + if (wake_util > min_wake_util) + continue; + if (new_util > best_active_util) + continue; + min_wake_util = wake_util; + best_active_util = new_util; + best_active_cpu = i; + continue; } + + /* + * Case B) Non latency sensitive tasks on IDLE CPUs. + * + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - minimizing the capacity_orig, + * i.e. preferring LITTLE CPUs + * - favoring shallowest idle states + * i.e. avoid waking up deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one such CPU is available, it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performance by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumption without affecting performance. + */ + if (idle_cpu(i)) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + + /* Select idle CPU with lower cap_orig */ + if (capacity_orig > best_idle_min_cap_orig) + continue; + + /* + * Skip CPUs in deeper idle state, but only + * if they are also less energy efficient. + * IOW, prefer a deep IDLE LITTLE CPU vs a + * shallow idle big CPU. + */ + if (sysctl_sched_cstate_aware && + best_idle_cstate <= idle_idx) + continue; + + /* Keep track of best idle CPU */ + best_idle_min_cap_orig = capacity_orig; + best_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + + /* + * Case C) Non latency sensitive tasks on ACTIVE CPUs. + * + * Pack tasks in the most energy efficient capacities. + * + * This task packing strategy prefers more energy + * efficient CPUs (i.e.
pack on smaller maximum + * capacity CPUs) while also trying to spread tasks to + * run them all at the lower OPP. + * + * This assumes for example that it's more energy + * efficient to run two tasks on two CPUs at a lower + * OPP than packing both on a single CPU but running + * that CPU at an higher OPP. + * + * Thus, this case keep track of the CPU with the + * smallest maximum capacity and highest spare maximum + * capacity. + */ + + /* Favor CPUs with smaller capacity */ + if (capacity_orig > target_capacity) + continue; + + /* Favor CPUs with maximum spare capacity */ + if ((capacity_orig - new_util) < target_max_spare_cap) + continue; + + target_max_spare_cap = capacity_orig - new_util; + target_capacity = capacity_orig; + target_util = new_util; + target_cpu = i; } + } while (sg = sg->next, sg != sd->groups); - if (target_cpu < 0) - target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; + /* + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we was not able to find a target + * ACTIVE CPU. + * + * Policies priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available, we return immediately + * b) ACTIVE CPU where task fits and has the bigger maximum spare + * capacity (i.e. target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu + */ + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; - if (target_cpu >= 0) { - schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); - schedstat_inc(this_rq(), eas_stats.fbt_count); - } + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq(), eas_stats.fbt_count); return target_cpu; } diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index e181c87a928d..f56c4da16d0b 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -55,6 +55,8 @@ static inline void walt_migrate_sync_cpu(int cpu) { } static inline void walt_init_cpu_efficiency(void) { } static inline u64 walt_ktime_clock(void) { return 0; } +#define walt_cpu_high_irqload(cpu) false + #endif /* CONFIG_SCHED_WALT */ extern unsigned int walt_disabled; From bf6cd4d156b7b4ef09d00de92616eef49bb0efc7 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 29 Jun 2017 12:24:27 +0100 Subject: [PATCH 0959/3220] events: add tracepoint for find_best_target Change-Id: I4c245ffacb207d7ea826c5763a426efe5399e0a2 Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 42 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 10 +++++++++ 2 files changed, 52 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 433d3919ba00..8b97464803f1 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -877,6 +877,48 @@ TRACE_EVENT(sched_boost_task, __entry->margin) ); +/* + * Tracepoint for find_best_target + */ +TRACE_EVENT(sched_find_best_target, + + TP_PROTO(struct task_struct *tsk, bool prefer_idle, + unsigned long min_util, int start_cpu, + int best_idle, int best_active, int target), + + TP_ARGS(tsk, prefer_idle, min_util, start_cpu, + best_idle, best_active, target), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, min_util ) + __field( bool, prefer_idle ) + __field( int, start_cpu ) + __field( int, best_idle ) + __field( int, best_active ) + __field( int, target ) + ), + + TP_fast_assign( + 
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->min_util = min_util; + __entry->prefer_idle = prefer_idle; + __entry->start_cpu = start_cpu; + __entry->best_idle = best_idle; + __entry->best_active = best_active; + __entry->target = target; + ), + + TP_printk("pid=%d comm=%s prefer_idle=%d start_cpu=%d " + "best_idle=%d best_active=%d target=%d", + __entry->pid, __entry->comm, + __entry->prefer_idle, __entry->start_cpu, + __entry->best_idle, __entry->best_active, + __entry->target) +); + /* * Tracepoint for accounting sched group energy */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a61c47a030a0..e6336157feec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6291,6 +6291,12 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre if (idle_cpu(i)) { schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); + + trace_sched_find_best_target(p, + prefer_idle, min_util, + cpu, best_idle_cpu, + best_active_cpu, i); + return i; } @@ -6433,6 +6439,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre ? best_active_cpu : best_idle_cpu; + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, + best_idle_cpu, best_active_cpu, + target_cpu); + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); schedstat_inc(this_rq(), eas_stats.fbt_count); From 7b63e1ff52134bbc27c6e30c1fd909ff9a87bc2b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 28 Apr 2017 15:23:33 +0100 Subject: [PATCH 0960/3220] sched/fair: Update signals of nohz cpus if we are going idle Stale cpu utilization signals can cause havoc for energy-aware systems, and they are caused by no updates being performed for cpus which have no tick running. There is open debate about when is the correct time to update these cpus, and general recognition that something needs to be done. This is an attempt to do something useful. When we are looking for a task to pull for a newly-idle cpu, we have an opportunity to update the stats for any cpu which has no tick running without causing too much disturbance to the system or waking it up. Change-Id: I0280104ea9c53e56c26f1c56a62bacab5d3e951b Signed-off-by: Chris Redpath Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6336157feec..dff742f276d7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8051,6 +8051,38 @@ group_type group_classify(struct sched_group *group, return group_other; } +#ifdef CONFIG_NO_HZ_COMMON +/* + * idle load balancing data + * - used by the nohz balance, but we want it available here + * so that we can see which CPUs have no tick. 
+ */
+static struct {
+	cpumask_var_t idle_cpus_mask;
+	atomic_t nr_cpus;
+	unsigned long next_balance;	/* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+	/* only called from update_sg_lb_stats when irqs are disabled */
+	if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
+		/* rate limit updates to once per jiffy at most */
+		if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
+			return;
+
+		raw_spin_lock(&rq->lock);
+		update_rq_clock(rq);
+		update_idle_cpu_load(rq);
+		update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+		raw_spin_unlock(&rq->lock);
+	}
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -8074,6 +8106,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
+		/* if we are entering idle and there are CPUs with
+		 * their tick stopped, do an update for them
+		 */
+		if (env->idle == CPU_NEWLY_IDLE)
+			update_cpu_stats_if_tickless(rq);
+
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = target_load(i, load_idx);
@@ -9314,12 +9352,6 @@ static inline int on_null_domain(struct rq *rq)
  * needed, they will kick the idle load balancer, which then does idle
  * load balancing for all the idle CPUs.
  */
-static struct {
-	cpumask_var_t idle_cpus_mask;
-	atomic_t nr_cpus;
-	unsigned long next_balance;	/* in jiffy units */
-} nohz ____cacheline_aligned;
-
 static inline int find_new_ilb(void)
 {
 	int ilb = cpumask_first(nohz.idle_cpus_mask);

From ebc28671a5a3a657c1f88fbde4be07c4ef395aef Mon Sep 17 00:00:00 2001
From: Leo Yan
Date: Mon, 27 Mar 2017 15:00:14 +0100
Subject: [PATCH 0961/3220] sched/fair: kick nohz idle balance for misfit task

If a CPU has a misfit task, the current code does not handle that
situation in the nohz idle balance path. As a result, the misfit task
can stay running on a little core for a long time.

So check whether the CPU has a misfit task and, if it does, kick the
nohz idle balancer so that an active balance can eventually be
executed.

Change-Id: I117d3b7404296f8de11cb960a87a6b9a54a9f348
Signed-off-by: Leo Yan
[taken from https://lists.linaro.org/pipermail/eas-dev/2016-September/000551.html]
Signed-off-by: Chris Redpath
Signed-off-by: Patrick Bellasi
---
 kernel/sched/fair.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dff742f276d7..90b44c6cca37 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9701,6 +9701,10 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	    (!energy_aware() || cpu_overutilized(cpu)))
 		return true;
 
+	/* Do idle load balance if there is a misfit task */
+	if (energy_aware() && rq->misfit_task)
+		return true;
+
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 	if (sd && !energy_aware()) {

From e76348ec5f7f4e37f827cfcee2ead8c1089912c4 Mon Sep 17 00:00:00 2001
From: Brendan Jackman
Date: Thu, 29 Jun 2017 17:24:29 +0100
Subject: [PATCH 0962/3220] Revert "sched/fair: ensure utilization signals are
 synchronized before use"

This reverts commit 83f462daa328f2f42c3c1f7f5277f71e3fa0f750.
Change-Id: I37ba36da61df2beb3a005557d9b673027f446916 Signed-off-by: Brendan Jackman --- kernel/sched/fair.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 90b44c6cca37..04eb6c66a78b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6573,16 +6573,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) { - /* - * do wake_cap unconditionally as it causes task and cpu - * utilization to be synced, and we need that for energy - * aware wakeups - */ - int _wake_cap = wake_cap(p, cpu, prev_cpu); - want_affine = !wake_wide(p) && !_wake_cap + if (sd_flag & SD_BALANCE_WAKE) + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); - } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); From d96e40472807e08d8352d22c0d3b0ce142893c53 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 29 Jun 2017 17:29:31 +0100 Subject: [PATCH 0963/3220] sched/fair: Sync task util before EAS wakeup Before using a task's util_avg signal in EAS, we need to ensure that it has been synced up to the last_update_time of prev_cpu's root cfs_rq. We previously relied on the side effect of wake_cap to do that, however that does not happen when the waking CPU has the same capacity as the prev_cpu. Therefore just explicitly call sync_entity_load_avg. This may result in calling that function twice within the same select_task_rq_fair, but since last_update_time hasn't changed the second call will bail out very quickly. Change-Id: I91f1fcd71dfeb96b7f5b73418f1cf9ac311d4655 Signed-off-by: Brendan Jackman Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04eb6c66a78b..cd72ce142974 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6501,6 +6501,8 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync prefer_idle = 0; #endif + sync_entity_load_avg(&p->se); + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); /* Find a cpu with sufficient capacity */ tmp_target = find_best_target(p, boosted, prefer_idle); From 6cb8fcccb26e999a1745d208135c8cea7091639c Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 4 Jul 2017 10:23:03 +0100 Subject: [PATCH 0964/3220] sched/fair: Try to estimate possible idle states. In the current EAS group energy calculations, we only use the idle state of the group as it is right now. This means that there are times when EAS cannot see that we are about to remove all utilization from a group which is likely to result in us being able to idle that entire group. This is an attempt to detect that situation and at least allow the energy calculation to include savings in that scenario, regardless of what we might be able to actually achieve in the real world. If a cluster or cpu looks like it will have some idle time available to it, we try to map the utilization onto an idle state. 
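As a rough worked example of the mapping (illustrative numbers, not
taken from the patch): with nr_idle_states = 4, the deepest index
reachable by the linear map is max_idle_state_idx = nr_idle_states - 2
= 2. A group that would be left with no utilization is assigned
max_idle_state_idx + 1 = 3, the deepest state; one left with half of a
max_capacity of 1024 maps to 2 - min(2, (512 * 2) / 1024) = 1, a
middling state; and a group whose post-move utilization would still
exceed max_capacity * group_weight is pinned to state 0, i.e. assumed
to get no idle time at all.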
Change-Id: I8fcb1e507f65ae6a2c5647eeef75a4bf28c7a0c0 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cd72ce142974..d1445e7b298a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5396,9 +5396,11 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } -static int group_idle_state(struct sched_group *sg) +static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) { int i, state = INT_MAX; + int src_in_grp, dst_in_grp; + long grp_util = 0; /* Find the shallowest idle state in the sched group. */ for_each_cpu(i, sched_group_cpus(sg)) @@ -5407,6 +5409,54 @@ static int group_idle_state(struct sched_group *sg) /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ state++; + /* + * Try to estimate if a deeper idle state is + * achievable when we move the task. + */ + for_each_cpu(i, sched_group_cpus(sg)) + grp_util += cpu_util(i); + + src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); + if (src_in_grp == dst_in_grp) { + /* both CPUs under consideration are in the same group or not in + * either group, migration should leave idle state the same. + */ + goto end; + } + /* add or remove util as appropriate to indicate what group util + * will be (worst case - no concurrent execution) after moving the task + */ + grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta; + + if (grp_util <= + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) { + /* after moving, this group is at most partly + * occupied, so it should have some idle time. + */ + int max_idle_state_idx = sg->sge->nr_idle_states - 2; + int new_state = grp_util * max_idle_state_idx; + if (grp_util <= 0) + /* group will have no util, use lowest state */ + new_state = max_idle_state_idx + 1; + else { + /* for partially idle, linearly map util to idle + * states, excluding the lowest one. This does not + * correspond to the state we expect to enter in + * reality, but an indication of what might happen. + */ + new_state = min(max_idle_state_idx, (int) + (new_state / sg->sgc->max_capacity)); + new_state = max_idle_state_idx - new_state; + } + state = new_state; + } else { + /* After moving, the group will be fully occupied + * so assume it will not be idle at all. + */ + state = 0; + } +end: return state; } @@ -5479,8 +5529,9 @@ static int sched_group_energy(struct energy_env *eenv) } } - idle_idx = group_idle_state(sg); + idle_idx = group_idle_state(eenv, sg); group_util = group_norm_util(eenv, sg); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) From 2e13f308a9854f5ff2b3735f4dc58c2a10cf7ea1 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 4 Jul 2017 10:19:58 +0100 Subject: [PATCH 0965/3220] sched/fair: Add a backup_cpu to find_best_target Sometimes we find a target cpu but then we do not use it as the energy_diff indicates that we would increase energy usage or not save anything. To offer an additional option for those cases, we return a second option which is what we would have selected if the target CPU had not been found. This gives us another chance to try to save some energy. 
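In code, the caller-side fallback amounts to the following (condensed
from the select_energy_cpu_brute() hunk below; schedstats and locking
elided):

	target_cpu = find_best_target(p, &tmp_backup, boosted, prefer_idle);
	...
	eenv.dst_cpu = target_cpu;
	if (energy_diff(&eenv) >= 0) {
		/* No energy saving for target_cpu, try the backup */
		target_cpu = tmp_backup;
		eenv.dst_cpu = target_cpu;
		if (tmp_backup < 0 || energy_diff(&eenv) >= 0)
			target_cpu = prev_cpu;	/* give up, stay on prev_cpu */
	}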
Change-Id: I42c4f20aba10e4cf65b51ac4153e2e00e534c8c7 Signed-off-by: Chris Redpath Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d1445e7b298a..c193e9b1c38f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6238,7 +6238,8 @@ static int start_cpu(bool boosted) return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } -static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) +static inline int find_best_target(struct task_struct *p, int *backup_cpu, + bool boosted, bool prefer_idle) { unsigned long best_idle_min_cap_orig = ULONG_MAX; unsigned long min_util = boosted_task_util(p); @@ -6255,6 +6256,8 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre int target_cpu = -1; int cpu, i; + *backup_cpu = -1; + schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); schedstat_inc(this_rq(), eas_stats.fbt_attempts); @@ -6489,6 +6492,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted, bool pre target_cpu = prefer_idle ? best_active_cpu : best_idle_cpu; + else + *backup_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; trace_sched_find_best_target(p, prefer_idle, min_util, cpu, best_idle_cpu, best_active_cpu, @@ -6527,7 +6534,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { struct sched_domain *sd; - int target_cpu = prev_cpu, tmp_target; + int target_cpu = prev_cpu, tmp_target, tmp_backup; bool boosted, prefer_idle; schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); @@ -6556,7 +6563,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); /* Find a cpu with sufficient capacity */ - tmp_target = find_best_target(p, boosted, prefer_idle); + tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle); if (!sd) goto unlock; @@ -6585,10 +6592,15 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } if (energy_diff(&eenv) >= 0) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); - target_cpu = prev_cpu; - goto unlock; + /* No energy saving for target_cpu, try backup */ + target_cpu = tmp_backup; + eenv.dst_cpu = target_cpu; + if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; + goto unlock; + } } schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); From 7eeebce62c213a3380c61d0e815ccd7ae1d4b410 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 27 Jul 2017 23:52:24 +0200 Subject: [PATCH 0966/3220] ANDROID: binder: Don't BUG_ON(!spin_is_locked()). Because is_spin_locked() always returns false on UP systems. Use assert_spin_locked() instead, and remove the WARN_ON() instances, since those were easy to verify. 
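For illustration, on CONFIG_SMP=n kernels the relevant definitions
reduce to roughly the following (paraphrased from the generic UP
spinlock headers, not verbatim):

	/* UP, !CONFIG_DEBUG_SPINLOCK */
	#define arch_spin_is_locked(lock)	((void)(lock), 0)
	#define assert_raw_spin_locked(lock)	do { (void)(lock); } while (0)

	BUG_ON(!spin_is_locked(&proc->inner_lock));	/* BUG_ON(1): fires on every call */
	assert_spin_locked(&proc->inner_lock);		/* no-op on UP, real check on SMP */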
Bug: 64073116 Change-Id: I9080991c6d67e91928282a3ee64db23e50c7d66a Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 29e3e3f21443..4cf8e05c7a03 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1014,7 +1014,7 @@ binder_select_thread_ilocked(struct binder_proc *proc) { struct binder_thread *thread; - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); thread = list_first_entry_or_null(&proc->waiting_threads, struct binder_thread, waiting_thread_node); @@ -1045,7 +1045,7 @@ static void binder_wakeup_thread_ilocked(struct binder_proc *proc, struct binder_thread *thread, bool sync) { - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); if (thread) { if (sync) @@ -1224,7 +1224,7 @@ static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, struct rb_node *n = proc->nodes.rb_node; struct binder_node *node; - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); while (n) { node = rb_entry(n, struct binder_node, rb_node); @@ -1270,7 +1270,8 @@ static struct binder_node *binder_init_node_ilocked( __u32 flags = fp ? fp->flags : 0; s8 priority; - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); + while (*p) { parent = *p; @@ -1349,9 +1350,9 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong, { struct binder_proc *proc = node->proc; - BUG_ON(!spin_is_locked(&node->lock)); + assert_spin_locked(&node->lock); if (proc) - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) { if (target_list == NULL && @@ -1403,9 +1404,9 @@ static bool binder_dec_node_nilocked(struct binder_node *node, { struct binder_proc *proc = node->proc; - BUG_ON(!spin_is_locked(&node->lock)); + assert_spin_locked(&node->lock); if (proc) - BUG_ON(!spin_is_locked(&proc->inner_lock)); + assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) node->internal_strong_refs--; @@ -1929,7 +1930,7 @@ static void binder_pop_transaction_ilocked(struct binder_thread *target_thread, struct binder_transaction *t) { BUG_ON(!target_thread); - BUG_ON(!spin_is_locked(&target_thread->proc->inner_lock)); + assert_spin_locked(&target_thread->proc->inner_lock); BUG_ON(target_thread->transaction_stack != t); BUG_ON(target_thread->transaction_stack->from != target_thread); target_thread->transaction_stack = @@ -5072,7 +5073,6 @@ static void print_binder_transaction_ilocked(struct seq_file *m, struct binder_proc *to_proc; struct binder_buffer *buffer = t->buffer; - WARN_ON(!spin_is_locked(&proc->inner_lock)); spin_lock(&t->lock); to_proc = t->to_proc; seq_printf(m, @@ -5161,7 +5161,6 @@ static void print_binder_thread_ilocked(struct seq_file *m, size_t start_pos = m->count; size_t header_pos; - WARN_ON(!spin_is_locked(&thread->proc->inner_lock)); seq_printf(m, " thread %d: l %02x need_return %d tr %d\n", thread->pid, thread->looper, thread->looper_need_return, @@ -5198,10 +5197,6 @@ static void print_binder_node_nilocked(struct seq_file *m, struct binder_work *w; int count; - WARN_ON(!spin_is_locked(&node->lock)); - if (node->proc) - WARN_ON(!spin_is_locked(&node->proc->inner_lock)); - count = 0; hlist_for_each_entry(ref, &node->refs, node_entry) count++; @@ -5228,7 +5223,6 @@ static void print_binder_node_nilocked(struct seq_file 
*m, static void print_binder_ref_olocked(struct seq_file *m, struct binder_ref *ref) { - WARN_ON(!spin_is_locked(&ref->proc->outer_lock)); binder_node_lock(ref->node); seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n", ref->data.debug_id, ref->data.desc, From b315101a358facc225f4bdeff1adc4b0b95f95c9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Jul 2017 19:12:22 -0700 Subject: [PATCH 0967/3220] ANDROID: sdcardfs: override credential for ioctl to lower fs Otherwise, lower_fs->ioctl() fails due to inode_owner_or_capable(). Signed-off-by: Jaegeuk Kim Bug: 63260873 Change-Id: I623a6c7c5f8a3cbd7ec73ef89e18ddb093c43805 --- fs/sdcardfs/file.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 6076c342dae6..5ac0b0bbb0ec 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -104,12 +104,19 @@ static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, { long err = -ENOTTY; struct file *lower_file; + const struct cred *saved_cred = NULL; + struct dentry *dentry = file->f_path.dentry; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); lower_file = sdcardfs_lower_file(file); /* XXX: use vfs_ioctl if/when VFS exports it */ if (!lower_file || !lower_file->f_op) goto out; + + /* save current_cred and override it */ + OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(file_inode(file))); + if (lower_file->f_op->unlocked_ioctl) err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); @@ -117,6 +124,7 @@ static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd, if (!err) sdcardfs_copy_and_fix_attrs(file_inode(file), file_inode(lower_file)); + REVERT_CRED(saved_cred); out: return err; } @@ -127,15 +135,23 @@ static long sdcardfs_compat_ioctl(struct file *file, unsigned int cmd, { long err = -ENOTTY; struct file *lower_file; + const struct cred *saved_cred = NULL; + struct dentry *dentry = file->f_path.dentry; + struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb); lower_file = sdcardfs_lower_file(file); /* XXX: use vfs_ioctl if/when VFS exports it */ if (!lower_file || !lower_file->f_op) goto out; + + /* save current_cred and override it */ + OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(file_inode(file))); + if (lower_file->f_op->compat_ioctl) err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + REVERT_CRED(saved_cred); out: return err; } From 50d3f7d55a4fa7c46a0fec709faeae636e5841fd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Jun 2016 15:09:37 -0700 Subject: [PATCH 0968/3220] UPSTREAM: Clarify naming of thread info/stack allocators We've had the thread info allocated together with the thread stack for most architectures for a long time (since the thread_info was split off from the task struct), but that is about to change. But the patches that move the thread info to be off-stack (and a part of the task struct instead) made it clear how confused the allocator and freeing functions are. Because the common case was that we share an allocation with the thread stack and the thread_info, the two pointers were identical. That identity then meant that we would have things like ti = alloc_thread_info_node(tsk, node); ... tsk->stack = ti; which certainly _worked_ (since stack and thread_info have the same value), but is rather confusing: why are we assigning a thread_info to the stack? And if we move the thread_info away, the "confusing" code just gets to be entirely bogus. 
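After the rename, the same sequence reads (in the spirit of the
kernel/fork.c hunk below):

	unsigned long *stack = alloc_thread_stack_node(tsk, node);
	...
	tsk->stack = stack;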
So remove all this confusion, and make it clear that we are doing the stack allocation by renaming and clarifying the function names to be about the stack. The fact that the thread_info then shares the allocation is an implementation detail, and not really about the allocation itself. This is a pure renaming and type fix: we pass in the same pointer, it's just that we clarify what the pointer means. The ia64 code that actually only has one single allocation (for all of task_struct, thread_info and kernel thread stack) now looks a bit odd, but since "tsk->stack" is actually not even used there, that oddity doesn't matter. It would be a separate thing to clean that up, I intentionally left the ia64 changes as a pure brute-force renaming and type change. Acked-by: Andy Lutomirski Signed-off-by: Linus Torvalds Bug: 38331309 Change-Id: I870b5476fc900c9145134f9dd3ed18a32a490162 (cherry picked from commit b235beea9e996a4d36fed6cfef4801a3e7d7a9a5) Signed-off-by: Zubin Mithra --- arch/Kconfig | 4 +- arch/ia64/Kconfig | 2 +- arch/ia64/include/asm/thread_info.h | 8 ++-- arch/mn10300/include/asm/thread_info.h | 2 +- arch/mn10300/kernel/kgdb.c | 3 +- arch/tile/include/asm/thread_info.h | 2 +- arch/tile/kernel/process.c | 3 +- include/linux/sched.h | 2 +- init/main.c | 4 +- kernel/fork.c | 52 +++++++++++++------------- 10 files changed, 43 insertions(+), 39 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 98f64ad1caf1..ed2539c590bf 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -225,8 +225,8 @@ config ARCH_INIT_TASK config ARCH_TASK_STRUCT_ALLOCATOR bool -# Select if arch has its private alloc_thread_info() function -config ARCH_THREAD_INFO_ALLOCATOR +# Select if arch has its private alloc_thread_stack() function +config ARCH_THREAD_STACK_ALLOCATOR bool # Select if arch wants to size task_struct dynamically via arch_task_struct_size: diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index eb0249e37981..7534fd34f79d 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -45,7 +45,7 @@ config IA64 select GENERIC_SMP_IDLE_THREAD select ARCH_INIT_TASK select ARCH_TASK_STRUCT_ALLOCATOR - select ARCH_THREAD_INFO_ALLOCATOR + select ARCH_THREAD_STACK_ALLOCATOR select ARCH_CLOCKSOURCE_DATA select GENERIC_TIME_VSYSCALL_OLD select SYSCTL_ARCH_UNALIGN_NO_WARN diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index aa995b67c3f5..d1212b84fb83 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -48,15 +48,15 @@ struct thread_info { #ifndef ASM_OFFSETS_C /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) -#define alloc_thread_info_node(tsk, node) \ - ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) +#define alloc_thread_stack_node(tsk, node) \ + ((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE)) #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #else #define current_thread_info() ((struct thread_info *) 0) -#define alloc_thread_info_node(tsk, node) ((struct thread_info *) 0) +#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_info(ti) /* nothing */ +#define free_thread_stack(ti) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index 
4861a78c7160..f5f90bbf019d 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -115,7 +115,7 @@ static inline unsigned long current_stack_pointer(void) } #ifndef CONFIG_KGDB -void arch_release_thread_info(struct thread_info *ti); +void arch_release_thread_stack(unsigned long *stack); #endif #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) diff --git a/arch/mn10300/kernel/kgdb.c b/arch/mn10300/kernel/kgdb.c index 99770823451a..2d7986c386fe 100644 --- a/arch/mn10300/kernel/kgdb.c +++ b/arch/mn10300/kernel/kgdb.c @@ -397,8 +397,9 @@ static bool kgdb_arch_undo_singlestep(struct pt_regs *regs) * single-step state is cleared. At this point the breakpoints should have * been removed by __switch_to(). */ -void arch_release_thread_info(struct thread_info *ti) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *ti = (void *)stack; if (kgdb_sstep_thread == ti) { kgdb_sstep_thread = NULL; diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index dc1fb28d9636..489b15016303 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -78,7 +78,7 @@ struct thread_info { #ifndef __ASSEMBLY__ -void arch_release_thread_info(struct thread_info *info); +void arch_release_thread_stack(unsigned long *stack); /* How to get the thread information struct from C. */ register unsigned long stack_pointer __asm__("sp"); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 7d5769310bef..a97ab1a69a90 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -73,8 +73,9 @@ void arch_cpu_idle(void) /* * Release a thread_info structure */ -void arch_release_thread_info(struct thread_info *info) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *info = (void *)stack; struct single_step_state *step_state = info->step_state; if (step_state) { diff --git a/include/linux/sched.h b/include/linux/sched.h index ad2c304b29b8..85cf2d2f02cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2986,7 +2986,7 @@ static inline int object_is_on_stack(void *obj) return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } -extern void thread_info_cache_init(void); +extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE static inline unsigned long stack_not_used(struct task_struct *p) diff --git a/init/main.c b/init/main.c index fbafa271531c..86f5ce9ede86 100644 --- a/init/main.c +++ b/init/main.c @@ -468,7 +468,7 @@ void __init __weak smp_setup_processor_id(void) } # if THREAD_SIZE >= PAGE_SIZE -void __init __weak thread_info_cache_init(void) +void __init __weak thread_stack_cache_init(void) { } #endif @@ -645,7 +645,7 @@ asmlinkage __visible void __init start_kernel(void) /* Should be run before the first non-init thread is created */ init_espfix_bsp(); #endif - thread_info_cache_init(); + thread_stack_cache_init(); cred_init(); fork_init(); proc_caches_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 68cfda1c1800..5ee818516a1c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -147,18 +147,18 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, 
otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -167,29 +167,31 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(stack); + + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -212,9 +214,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -222,8 +224,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -334,7 +336,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; + unsigned long *stack; int err; if (node == NUMA_NO_NODE) @@ -343,15 +345,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -383,12 +385,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: 
 	free_task_struct(tsk);
 	return NULL;
 }

From f707c0f98f32e79c148b6c4284fa1681b441ee6d Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 24 Jun 2016 17:07:33 -0700
Subject: [PATCH 0969/3220] UPSTREAM: fix up initial thread stack pointer vs
 thread_info confusion

The INIT_TASK() initializer was similarly confused about the stack vs
thread_info allocation that the allocators had, and that was fixed in
commit b235beea9e99 ("Clarify naming of thread info/stack allocators").

The task ->stack pointer only incidentally ends up having the same
value as the thread_info, and in fact that will change.

So fix the initial task struct initializer to point to 'init_stack'
instead of 'init_thread_info', and make sure the ia64 definition for
that exists.

This actually makes the ia64 tsk->stack pointer be sensible for the
initial task, but not for any other task. As mentioned in commit
b235beea9e99, that whole pointer isn't actually used on ia64, since
task_stack_page() there just points to the (single) allocation.

All the other architectures seem to have copied the 'init_stack'
definition, even if it tended to be generally unused.

Signed-off-by: Linus Torvalds
Bug: 38331309
Change-Id: Ia96e9225b07e38df2f4af2b9a7eb2aa972d8845a
(cherry picked from commit 7f1a00b6fcd0e3c19beba2e92d157dc0c2cf3494)
Signed-off-by: Zubin Mithra
---
 arch/ia64/kernel/init_task.c | 1 +
 include/linux/init_task.h    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c
index f9efe9739d3f..0eaa89f3defd 100644
--- a/arch/ia64/kernel/init_task.c
+++ b/arch/ia64/kernel/init_task.c
@@ -26,6 +26,7 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * handled. This is done by having a special ".data..init_task" section...
  */
 #define init_thread_info	init_task_mem.s.thread_info
+#define init_stack		init_task_mem.stack
 
 union {
 	struct {
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1c1ff7e4faa4..9a0056499337 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -190,7 +190,7 @@ extern struct task_group root_task_group;
 #define INIT_TASK(tsk)	\
 {									\
 	.state		= 0,						\
-	.stack		= &init_thread_info,				\
+	.stack		= init_stack,					\
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= PF_KTHREAD,					\
 	.prio		= MAX_PRIO-20,					\

From 242f841e5485d19b39393ef90c1a133f3a7c990f Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 28 Jul 2016 15:48:23 -0700
Subject: [PATCH 0970/3220] UPSTREAM: printk: when dumping regs, show the
 stack, not thread_info

We currently show:

  task: <current> ti: <current_thread_info()> task.ti: <task_thread_info(current)>

"ti" and "task.ti" are redundant, and neither is actually what we want
to show, which is the base of the thread stack. Change the display to
show the stack pointer explicitly.
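For example, the oops header line changes from (pointer values
illustrative; the format strings are those in the hunk below):

  task: ffff880076ba2240 ti: ffff880076bd4000 task.ti: ffff880076bd4000

to:

  task: ffff880076ba2240 task.stack: ffff880076bd4000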
Link: http://lkml.kernel.org/r/543ac5bd66ff94000a57a02e11af7239571a3055.1468523549.git.luto@kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Bug: 38331309 Change-Id: I7d4b915d38770d0c9384695b2064e4c66b22e94e (cherry picked from commit 8b70ca65616b3588ea1907e87f0df6d2530350df) Signed-off-by: Zubin Mithra --- kernel/printk/printk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1a698158face..b4573b55b435 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3176,9 +3176,8 @@ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); + printk("%stask: %p task.stack: %p\n", + log_lvl, current, task_stack_page(current)); } #endif From 8bc69d462ad300364c836616b249055ca7cb19e9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:24 -0700 Subject: [PATCH 0971/3220] UPSTREAM: sched/core: Allow putting thread_info into task_struct If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT, then thread_info is defined as a single 'u32 flags' and is the first entry of task_struct. thread_info::task is removed (it serves no purpose if thread_info is embedded in task_struct), and thread_info::cpu gets its own slot in task_struct. This is heavily based on a patch written by Linus. Originally-from: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0898196f0476195ca02713691a5037a14f2aac5.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar Bug: 38331309 Change-Id: I25e5a830f2ada5e74fa93661e97e5e701b1b70d2 (cherry picked from commit c65eacbe290b8141554c71b2c94489e73ade8c8d) Signed-off-by: Zubin Mithra --- include/linux/init_task.h | 9 +++++++++ include/linux/sched.h | 36 ++++++++++++++++++++++++++++++++++-- include/linux/thread_info.h | 15 +++++++++++++++ init/Kconfig | 7 +++++++ init/init_task.c | 7 +++++-- kernel/sched/sched.h | 4 ++++ 6 files changed, 74 insertions(+), 4 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9a0056499337..021b1e9ff6cd 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,6 +15,8 @@ #include #include +#include + #ifdef CONFIG_SMP # define INIT_PUSHABLE_TASKS(tsk) \ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), @@ -183,12 +185,19 @@ extern struct task_group root_task_group; # define INIT_KASAN(tsk) #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK +# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk), +#else +# define INIT_TASK_TI(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) */ #define INIT_TASK(tsk) \ { \ + INIT_TASK_TI(tsk) \ .state = 0, \ .stack = init_stack, \ .usage = ATOMIC_INIT(2), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 85cf2d2f02cb..d8c1b4340283 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1518,6 +1518,13 @@ struct tlbflush_unmap_batch { }; struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* + * For reasons of header soup (see current_thread_info()), this + * must be the first element of task_struct. 
+ */ + struct thread_info thread_info; +#endif volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; @@ -1527,6 +1534,9 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; +#ifdef CONFIG_THREAD_INFO_IN_TASK + unsigned int cpu; /* current CPU */ +#endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; @@ -2556,7 +2566,9 @@ extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { +#ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; +#endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; @@ -2946,10 +2958,26 @@ static inline void threadgroup_change_end(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); } -#ifndef __HAVE_THREAD_FUNCTIONS +#ifdef CONFIG_THREAD_INFO_IN_TASK + +static inline struct thread_info *task_thread_info(struct task_struct *task) +{ + return &task->thread_info; +} +static inline void *task_stack_page(const struct task_struct *task) +{ + return task->stack; +} +#define setup_thread_stack(new,old) do { } while(0) +static inline unsigned long *end_of_stack(const struct task_struct *task) +{ + return task->stack; +} + +#elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_thread_info(task) ((struct thread_info *)(task)->stack) -#define task_stack_page(task) ((task)->stack) +#define task_stack_page(task) ((void *)(task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { @@ -3241,7 +3269,11 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) static inline unsigned int task_cpu(const struct task_struct *p) { +#ifdef CONFIG_THREAD_INFO_IN_TASK + return p->cpu; +#else return task_thread_info(p)->cpu; +#endif } static inline int task_node(const struct task_struct *p) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 4cf89517783a..8784cebd0f51 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -13,6 +13,21 @@ struct timespec; struct compat_timespec; +#ifdef CONFIG_THREAD_INFO_IN_TASK +struct thread_info { + u32 flags; /* low level flags */ +}; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .flags = 0, \ +} +#endif + +#ifdef CONFIG_THREAD_INFO_IN_TASK +#define current_thread_info() ((struct thread_info *)current) +#endif + /* * System call restart block. */ diff --git a/init/Kconfig b/init/Kconfig index 445af1262134..ae7995c2fce8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -26,6 +26,13 @@ config IRQ_WORK config BUILDTIME_EXTABLE_SORT bool +config THREAD_INFO_IN_TASK + bool + help + Select this to move thread_info off the stack into task_struct. To + make this work, an arch will need to remove all thread_info fields + except flags and fix any runtime bugs. + menu "General setup" config BROKEN diff --git a/init/init_task.c b/init/init_task.c index ba0a7f362d9e..11f83be1fa79 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task); * Initial thread structure. Alignment of this is handled by a special * linker map entry. 
*/ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = { +#ifndef CONFIG_THREAD_INFO_IN_TASK + INIT_THREAD_INFO(init_task) +#endif +}; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 430fff0d005d..029cf2bbeda2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1028,7 +1028,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } From 99cf9fa9a00606ee4d51d876af421362f1818160 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:43 -0700 Subject: [PATCH 0972/3220] UPSTREAM: sched/core: Add try_get_task_stack() and put_task_stack() There are a few places in the kernel that access stack memory belonging to a different task. Before we can start freeing task stacks before the task_struct is freed, we need a way for those code paths to pin the stack. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/17a434f50ad3d77000104f21666575e10a9c1fbd.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar Bug: 38331309 Change-Id: I414853e9b72ecb0967d5e1cbfc77b4929bf3f4f5 (cherry picked from commit c6c314a613cd7d03fb97713e0d642b493de42e69) Signed-off-by: Zubin Mithra --- include/linux/sched.h | 16 ++++++++++++++++ init/Kconfig | 3 +++ 2 files changed, 19 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index d8c1b4340283..0e6744bb2779 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2964,11 +2964,19 @@ static inline struct thread_info *task_thread_info(struct task_struct *task) { return &task->thread_info; } + +/* + * When accessing the stack of a non-current task that might exit, use + * try_get_task_stack() instead. task_stack_page will return a pointer + * that could get freed out from under you. + */ static inline void *task_stack_page(const struct task_struct *task) { return task->stack; } + #define setup_thread_stack(new,old) do { } while(0) + static inline unsigned long *end_of_stack(const struct task_struct *task) { return task->stack; @@ -3004,6 +3012,14 @@ static inline unsigned long *end_of_stack(struct task_struct *p) } #endif + +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return task_stack_page(tsk); +} + +static inline void put_task_stack(struct task_struct *tsk) {} + #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) diff --git a/init/Kconfig b/init/Kconfig index ae7995c2fce8..f5500e552254 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -33,6 +33,9 @@ config THREAD_INFO_IN_TASK make this work, an arch will need to remove all thread_info fields except flags and fix any runtime bugs. + One subtle change that will be needed is to use try_get_task_stack() + and put_task_stack() in save_thread_stack_tsk() and get_wchan(). 
+ menu "General setup" config BROKEN From a960dbced9c6e34bdb1fc5f69d673352d6c4d932 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 15 Sep 2016 22:45:44 -0700 Subject: [PATCH 0973/3220] UPSTREAM: kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function get_task_struct(tsk) no longer pins tsk->stack so all users of to_live_kthread() should do try_get_task_stack/put_task_stack to protect "struct kthread" which lives on kthread's stack. TODO: Kill to_live_kthread(), perhaps we can even kill "struct kthread" too, and rework kthread_stop(), it can use task_work_add() to sync with the exiting kernel thread. Message-Id: <20160629180357.GA7178@redhat.com> Signed-off-by: Oleg Nesterov Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cb9b16bbc19d4aea4507ab0552e4644c1211d130.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar Bug: 38331309 Change-Id: I2872658e56dcb1ab4173c490ef8f52affa54a404 (cherry picked from commit 23196f2e5f5d810578a772785807dcdc2b9fdce9) Signed-off-by: Zubin Mithra --- kernel/kthread.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index 698b8dec3074..d9b0be5c6a5f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -65,7 +65,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) + if (likely(vfork) && try_get_task_stack(k)) return __to_kthread(vfork); return NULL; } @@ -427,8 +427,10 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) + if (kthread) { __kthread_unpark(k, kthread); + put_task_stack(k); + } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -457,6 +459,7 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } + put_task_stack(k); ret = 0; } return ret; @@ -492,6 +495,7 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); + put_task_stack(k); } ret = k->exit_code; put_task_struct(k); From 264c551c4c77c9645a1c5a03735a71ed37348bc4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 19 Oct 2016 19:28:12 +0100 Subject: [PATCH 0974/3220] UPSTREAM: thread_info: factor out restart_block Since commit f56141e3e2d9aabf ("all arches, signal: move restart_block to struct task_struct"), thread_info and restart_block have been logically distinct, yet struct restart_block is still defined in . At least one architecture (erroneously) uses restart_block as part of its thread_info, and thus the definition of restart_block must come before the include of . Subsequent patches in this series need to shuffle the order of includes and definitions in , and will make this ordering fragile. This patch moves the definition of restart_block out to its own header. This serves as generic cleanup, logically separating thread_info and restart_block, and also makes it easier to avoid fragility. 
Signed-off-by: Mark Rutland Reviewed-by: Andy Lutomirski Cc: Andrew Morton Cc: Heiko Carstens Cc: Kees Cook Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I4283c87072c092179e2b6c02cbf7248b4a1c2d22 (cherry picked from commit 53d74d056a4e306a72b8883d325b5d853c0618e6) Signed-off-by: Zubin Mithra --- include/linux/restart_block.h | 51 +++++++++++++++++++++++++++++++++++ include/linux/thread_info.h | 41 +--------------------------- 2 files changed, 52 insertions(+), 40 deletions(-) create mode 100644 include/linux/restart_block.h diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h new file mode 100644 index 000000000000..0d905d8ec553 --- /dev/null +++ b/include/linux/restart_block.h @@ -0,0 +1,51 @@ +/* + * Common syscall restarting data + */ +#ifndef __LINUX_RESTART_BLOCK_H +#define __LINUX_RESTART_BLOCK_H + +#include +#include + +struct timespec; +struct compat_timespec; +struct pollfd; + +/* + * System call restart block. + */ +struct restart_block { + long (*fn)(struct restart_block *); + union { + /* For futex_wait and futex_wait_requeue_pi */ + struct { + u32 __user *uaddr; + u32 val; + u32 flags; + u32 bitset; + u64 time; + u32 __user *uaddr2; + } futex; + /* For nanosleep */ + struct { + clockid_t clockid; + struct timespec __user *rmtp; +#ifdef CONFIG_COMPAT + struct compat_timespec __user *compat_rmtp; +#endif + u64 expires; + } nanosleep; + /* For poll */ + struct { + struct pollfd __user *ufds; + int nfds; + int has_timeout; + unsigned long tv_sec; + unsigned long tv_nsec; + } poll; + }; +}; + +extern long do_no_restart_syscall(struct restart_block *parm); + +#endif /* __LINUX_RESTART_BLOCK_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 8784cebd0f51..e8369b0d71e1 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -9,9 +9,7 @@ #include #include - -struct timespec; -struct compat_timespec; +#include #ifdef CONFIG_THREAD_INFO_IN_TASK struct thread_info { @@ -28,43 +26,6 @@ struct thread_info { #define current_thread_info() ((struct thread_info *)current) #endif -/* - * System call restart block. - */ -struct restart_block { - long (*fn)(struct restart_block *); - union { - /* For futex_wait and futex_wait_requeue_pi */ - struct { - u32 __user *uaddr; - u32 val; - u32 flags; - u32 bitset; - u64 time; - u32 __user *uaddr2; - } futex; - /* For nanosleep */ - struct { - clockid_t clockid; - struct timespec __user *rmtp; -#ifdef CONFIG_COMPAT - struct compat_timespec __user *compat_rmtp; -#endif - u64 expires; - } nanosleep; - /* For poll */ - struct { - struct pollfd __user *ufds; - int nfds; - int has_timeout; - unsigned long tv_sec; - unsigned long tv_nsec; - } poll; - }; -}; - -extern long do_no_restart_syscall(struct restart_block *parm); - #include #include From f2b8210f0a7c3f717b82880a1160aaa9255ceecf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 19 Oct 2016 19:28:13 +0100 Subject: [PATCH 0975/3220] UPSTREAM: thread_info: include for THREAD_INFO_IN_TASK When CONFIG_THREAD_INFO_IN_TASK is selected, the current_thread_info() macro relies on current having been defined prior to its use. However, not all users of current_thread_info() include , and thus current is not guaranteed to be defined. When CONFIG_THREAD_INFO_IN_TASK is not selected, it's possible that get_current() / current are based upon current_thread_info(), and includes . Thus always including would result in circular dependences on some platforms. 
To ensure both cases work, this patch includes , but only when CONFIG_THREAD_INFO_IN_TASK is selected. Signed-off-by: Mark Rutland Acked-by: Heiko Carstens Reviewed-by: Andy Lutomirski Cc: Andrew Morton Cc: Kees Cook Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Ia981a829798d60a54d4e3eb679d8e24b01228357 (cherry picked from commit dc3d2a679cd8631b8a570fc8ca5f4712d7d25698) Signed-off-by: Zubin Mithra --- include/linux/thread_info.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e8369b0d71e1..8933ecc2bc9f 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -12,17 +12,12 @@ #include #ifdef CONFIG_THREAD_INFO_IN_TASK -struct thread_info { - u32 flags; /* low level flags */ -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .flags = 0, \ -} -#endif - -#ifdef CONFIG_THREAD_INFO_IN_TASK +/* + * For CONFIG_THREAD_INFO_IN_TASK kernels we need for the + * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels, + * including can cause a circular dependency on some platforms. + */ +#include #define current_thread_info() ((struct thread_info *)current) #endif From b4674788d072c3e2036dedbab83d4abd233cc41c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:03 +0000 Subject: [PATCH 0976/3220] UPSTREAM: arm64: thread_info remove stale items We have a comment claiming __switch_to() cares about where cpu_context is located relative to cpu_domain in thread_info. However arm64 has never had a thread_info::cpu_domain field, and neither __switch_to nor cpu_switch_to care where the cpu_context field is relative to others. Additionally, the init_thread_info alias is never used anywhere in the kernel, and will shortly become problematic when thread_info is moved into task_struct. This patch removes both. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: James Morse Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Ia4769ddcc6fc556e9eb6193d64fc99fe2d9e39ab (cherry picked from commit dcbe02855f048fdf1e13ebc697e83c8d297f9f5a) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/thread_info.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 794d22603f04..9224bf2d59d5 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -42,7 +42,6 @@ typedef unsigned long mm_segment_t; /* * low level task data that entry.S needs immediate access to. - * __switch_to() assumes cpu_context follows immediately after cpu_domain. */ struct thread_info { unsigned long flags; /* low level flags */ @@ -63,7 +62,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) /* From d6d869da8de519c04a0c596af796d29f05f79ccc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:04 +0000 Subject: [PATCH 0977/3220] BACKPORT: arm64: asm-offsets: remove unused definitions Subsequent patches will move the thread_info::{task,cpu} fields, and the current TI_{TASK,CPU} offset definitions are not used anywhere. This patch removes the redundant definitions. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: James Morse Cc: Will Deacon Signed-off-by: Catalin Marinas This is a modification of Mark Rutland's original patch. Guards to check if CONFIG_THREAD_INFO_IN_TASK is used has been inserted. 
Bug: 38331309 Change-Id: I95903e0f862fc5dcf89e51926afa22389f2f7cee (cherry picked from commit 3fe12da4c7fa6491e0fb7c5371716ac7f8ea80a5) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/asm-offsets.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index b84d8e85d19d..66357a43c097 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,8 +36,10 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); +#ifndef CONFIG_THREAD_INFO_IN_TASK DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); +#endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0)); #endif From 725d3aa599931ab3834d379c90a07056e62fa179 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:05 +0000 Subject: [PATCH 0978/3220] UPSTREAM: arm64: factor out current_stack_pointer We define current_stack_pointer in <asm/thread_info.h>, though other files and headers relying upon it do not have this necessary include, and are thus fragile to changes in the header soup. Subsequent patches will affect the header soup such that directly including <asm/thread_info.h> may result in a circular header include in some of these cases, so we can't simply include <asm/thread_info.h>. Instead, factor current_stack_pointer into its own header, and have all existing users include this explicitly. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I4d6bc27bef686d0dade1d6abe1ce947cf6c4dfb3 (cherry picked from commit a9ea0017ebe8889dfa136cac2aa7ae0ee6915e1f) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/percpu.h | 2 ++ arch/arm64/include/asm/perf_event.h | 2 ++ arch/arm64/include/asm/stack_pointer.h | 9 +++++++++ arch/arm64/include/asm/thread_info.h | 6 +----- arch/arm64/kernel/return_address.c | 1 + arch/arm64/kernel/stacktrace.c | 1 + arch/arm64/kernel/traps.c | 1 + 7 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 arch/arm64/include/asm/stack_pointer.h diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 8a336852eeba..2ce1a0262a59 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -16,6 +16,8 @@ #ifndef __ASM_PERCPU_H #define __ASM_PERCPU_H +#include <asm/stack_pointer.h> + static inline void set_my_cpu_offset(unsigned long off) { asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory"); } diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 7bd3cdb533ea..91b6be092ce2 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -17,6 +17,8 @@ #ifndef __ASM_PERF_EVENT_H #define __ASM_PERF_EVENT_H +#include <asm/stack_pointer.h> + #ifdef CONFIG_PERF_EVENTS struct pt_regs; extern unsigned long perf_instruction_pointer(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/stack_pointer.h b/arch/arm64/include/asm/stack_pointer.h new file mode 100644 index 000000000000..ffcdf742cddf --- /dev/null +++ b/arch/arm64/include/asm/stack_pointer.h @@ -0,0 +1,9 @@ +#ifndef __ASM_STACK_POINTER_H +#define __ASM_STACK_POINTER_H + +/* + * how to get the current stack pointer from C + */ +register unsigned long current_stack_pointer asm ("sp"); + +#endif /* __ASM_STACK_POINTER_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 
9224bf2d59d5..41e4fd78e345 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -36,6 +36,7 @@ struct task_struct; +#include #include typedef unsigned long mm_segment_t; @@ -64,11 +65,6 @@ struct thread_info { #define init_stack (init_thread_union.stack) -/* - * how to get the current stack pointer from C - */ -register unsigned long current_stack_pointer asm ("sp"); - /* * how to get the thread information struct from C */ diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 1718706fde83..12a87f2600f2 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -12,6 +12,7 @@ #include #include +#include #include struct return_address_data { diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index cfd46c227c8c..d4606014b48a 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -22,6 +22,7 @@ #include #include +#include #include /* diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f5c82c76cf7c..34904e1aec31 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include From 62c30ed34e4fa9fa237276e90d089057d7332dc1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:06 +0000 Subject: [PATCH 0979/3220] UPSTREAM: arm64: traps: simplify die() and __die() In arm64's die and __die routines we pass around a thread_info, and subsequently use this to determine the relevant task_struct, and the end of the thread's stack. Subsequent patches will decouple thread_info from the stack, and this approach will no longer work. To figure out the end of the stack, we can use the new generic end_of_stack() helper. As we only call __die() from die(), and die() always deals with the current task, we can remove the parameter and have both acquire current directly, which also makes it clear that __die can't be called for arbitrary tasks. 
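[For reference, the generic end_of_stack() helper mentioned above comes from <linux/sched.h> and works in both configurations; roughly, in a simplified sketch that ignores the CONFIG_STACK_GROWSUP and stack-debugging variants:

#ifdef CONFIG_THREAD_INFO_IN_TASK
static inline unsigned long *end_of_stack(const struct task_struct *task)
{
	/* thread_info lives inside task_struct, so the whole stack
	 * allocation is stack; its lowest address is the limit, since
	 * kernel stacks grow downwards here. */
	return task->stack;
}
#else
static inline unsigned long *end_of_stack(struct task_struct *p)
{
	/* thread_info sits at the bottom of the stack allocation, so
	 * the usable stack ends immediately above it. */
	return (unsigned long *)(task_thread_info(p) + 1);
}
#endif

Either way the helper needs only a task_struct, which is exactly why __die() can lose its thread_info parameter.]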
Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Ie1a96a0a8e244d458a7f147001b64216403e07c4 (cherry picked from commit 876e7a38e8788773aac768091aaa3b42e470c03b) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/traps.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 34904e1aec31..3be84e579a8c 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -235,10 +235,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) #endif #define S_SMP " SMP" -static int __die(const char *str, int err, struct thread_info *thread, - struct pt_regs *regs) +static int __die(const char *str, int err, struct pt_regs *regs) { - struct task_struct *tsk = thread->task; + struct task_struct *tsk = current; static int die_counter; int ret; @@ -253,7 +252,8 @@ static int __die(const char *str, int err, struct thread_info *thread, print_modules(); __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); + TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), + end_of_stack(tsk)); if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->sp, @@ -273,7 +273,6 @@ static DEFINE_RAW_SPINLOCK(die_lock); */ void die(const char *str, struct pt_regs *regs, int err) { - struct thread_info *thread = current_thread_info(); int ret; oops_enter(); @@ -281,9 +280,9 @@ void die(const char *str, struct pt_regs *regs, int err) raw_spin_lock_irq(&die_lock); console_verbose(); bust_spinlocks(1); - ret = __die(str, err, thread, regs); + ret = __die(str, err, regs); - if (regs && kexec_should_crash(thread->task)) + if (regs && kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); From f00a4a09f4c3e5ac2580d8588a01f473fea0e554 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:07 +0000 Subject: [PATCH 0980/3220] UPSTREAM: arm64: unexport walk_stackframe The walk_stackframe functions is architecture-specific, with a varying prototype, and common code should not use it directly. None of its current users can be built as modules. With THREAD_INFO_IN_TASK, users will also need to hold a stack reference before calling it. There's no reason for it to be exported, and it's very easy to misuse, so unexport it for now. Signed-off-by: Mark Rutland Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Ibe0dca36cc7d35f92c6bc13b373755d82f0eb9ef (cherry picked from commit 2020a5ae7c8c2c8504565004915017507b135c63) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/stacktrace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d4606014b48a..191cc6cd3b59 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -126,7 +126,6 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, break; } } -EXPORT_SYMBOL(walk_stackframe); #ifdef CONFIG_STACKTRACE struct stack_trace_data { From 0f9f933796581345224426a06d58edf1bbd5a26b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:08 +0000 Subject: [PATCH 0981/3220] UPSTREAM: arm64: prep stack walkers for THREAD_INFO_IN_TASK When CONFIG_THREAD_INFO_IN_TASK is selected, task stacks may be freed before a task is destroyed. 
To account for this, the stacks are refcounted, and when manipulating the stack of another task, it is necessary to get/put the stack to ensure it isn't freed and/or re-used while we do so. This patch reworks the arm64 stack walking code to account for this. When CONFIG_THREAD_INFO_IN_TASK is not selected these perform no refcounting, and this should only be a structural change that does not affect behaviour. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: AKASHI Takahiro Cc: Andy Lutomirski Cc: James Morse Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I89c4f53c4fea0d0be2f88221489c0c7f43366810 (cherry picked from commit 9bbd4c56b0b642f04396da378296e68096d5afca) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/process.c | 20 ++++++++++++++------ arch/arm64/kernel/stacktrace.c | 5 +++++ arch/arm64/kernel/traps.c | 10 ++++++++++ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index e6afea67f6c1..75dac2c0d437 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -419,27 +419,35 @@ struct task_struct *__switch_to(struct task_struct *prev, unsigned long get_wchan(struct task_struct *p) { struct stackframe frame; - unsigned long stack_page; + unsigned long stack_page, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + stack_page = (unsigned long)try_get_task_stack(p); + if (!stack_page) + return 0; + frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif - stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || unwind_frame(p, &frame)) - return 0; - if (!in_sched_functions(frame.pc)) - return frame.pc; + goto out; + if (!in_sched_functions(frame.pc)) { + ret = frame.pc; + goto out; + } } while (count ++ < 16); - return 0; + +out: + put_task_stack(p); + return ret; } unsigned long arch_align_stack(unsigned long sp) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 191cc6cd3b59..4faaf1af88fd 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -157,6 +157,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) struct stack_trace_data data; struct stackframe frame; + if (!try_get_task_stack(tsk)) + return; + data.trace = trace; data.skip = trace->skip; @@ -178,6 +181,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; + + put_task_stack(tsk); } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 3be84e579a8c..e63708dc7b86 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -150,6 +150,14 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) unsigned long irq_stack_ptr; int skip; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); + + if (!tsk) + tsk = current; + + if (!try_get_task_stack(tsk)) + return; + /* * Switching between stacks is valid when tracing current and in * non-preemptible context. 
@@ -220,6 +228,8 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) stack + sizeof(struct pt_regs), false); } } + + put_task_stack(tsk); } void show_stack(struct task_struct *tsk, unsigned long *sp) From 48dd80cb1343a0e535cc6065f078dfde9b60f5ba Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:09 +0000 Subject: [PATCH 0982/3220] BACKPORT: arm64: move sp_el0 and tpidr_el1 into cpu_suspend_ctx When returning from idle, we rely on the fact that thread_info lives at the end of the kernel stack, and restore this by masking the saved stack pointer. Subsequent patches will sever the relationship between the stack and thread_info, and to cater for this we must save/restore sp_el0 explicitly, storing it in cpu_suspend_ctx. As cpu_suspend_ctx must be doubleword aligned, this leaves us with an extra slot in cpu_suspend_ctx. We can use this to save/restore tpidr_el1 in the same way, which simplifies the code, avoiding pointer chasing on the restore path (as we no longer need to load thread_info::cpu followed by the relevant slot in __per_cpu_offset based on this). This patch stashes both registers in cpu_suspend_ctx. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: James Morse Cc: Lorenzo Pieralisi Cc: Will Deacon Signed-off-by: Catalin Marinas This is a modification of Mark Rutland's original patch. The differences from the original patch are as follows :- - NR_CTX_REGS is set to 13 instead of 12 - x13 and x14 are used as temporary registers to hold sp_el0 and tpidr_el1 instead of x11 and x12. - The values are temporarily stashed at offset 88 and 96 of cpu_suspend_ctx instead of 80 and 88. The original patch would not apply cleanly and these changes were made to resolve this. Bug: 38331309 Change-Id: I4e72aebd51e99d3767487383c14a1ba784312bf1 (cherry picked from commit 623b476fc815464a0241ea7483da7b3580b7d8ac) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/suspend.h | 2 +- arch/arm64/kernel/sleep.S | 3 --- arch/arm64/kernel/suspend.c | 6 ------ arch/arm64/mm/proc.S | 10 ++++++++-- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index 59a5b0f1e81c..4d19a03d316e 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -1,7 +1,7 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 11 +#define NR_CTX_REGS 13 /* * struct cpu_suspend_ctx must be 16-byte aligned since it is allocated on diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index e33fe33876ab..f586f7c875e2 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -173,9 +173,6 @@ ENTRY(cpu_resume) /* load physical address of identity map page table in x1 */ adrp x1, idmap_pg_dir mov sp, x2 - /* save thread_info */ - and x2, x2, #~(THREAD_SIZE - 1) - msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context physical address * pointer and x1 to contain physical address of 1:1 page tables diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index f42b8b8f1d0a..e7a96462ca2d 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -95,12 +95,6 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ cpu_uninstall_idmap(); - /* - * Restore per-cpu offset before any kernel - * subsystem relying on it has a chance to run. 
- */ - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* * PSTATE was not saved over suspend/resume, re-enable any * detected features that might not have been set correctly. diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 85a542b21575..3b3a4710dcd6 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -71,12 +71,15 @@ ENTRY(cpu_do_suspend) mrs x10, mdscr_el1 mrs x11, oslsr_el1 mrs x12, sctlr_el1 + mrs x13, tpidr_el1 + mrs x14, sp_el0 stp x2, x3, [x0] stp x4, x5, [x0, #16] stp x6, x7, [x0, #32] stp x8, x9, [x0, #48] stp x10, x11, [x0, #64] - str x12, [x0, #80] + stp x12, x13, [x0, #80] + str x14, [x0, #96] ret ENDPROC(cpu_do_suspend) @@ -99,7 +102,8 @@ ENTRY(cpu_do_resume) ldp x6, x7, [x0, #32] ldp x8, x9, [x0, #48] ldp x10, x11, [x0, #64] - ldr x12, [x0, #80] + ldp x12, x13, [x0, #80] + ldr x14, [x0, #96] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 @@ -111,6 +115,8 @@ msr tcr_el1, x8 msr vbar_el1, x9 msr mdscr_el1, x10 + msr tpidr_el1, x13 + msr sp_el0, x14 /* * Restore oslsr_el1 by writing oslar_el1 */ From d8cd9de39f1d1c16354e3342f27366ab0203474f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:10 +0000 Subject: [PATCH 0983/3220] UPSTREAM: arm64: smp: prepare for smp_processor_id() rework Subsequent patches will make smp_processor_id() use a percpu variable. This will make smp_processor_id() dependent on the percpu offset, and thus we cannot use smp_processor_id() to figure out what to initialise the offset to. Prepare for this by initialising the percpu offset based on current::cpu, which will work regardless of how smp_processor_id() is implemented. Also, make this relationship obvious by placing this code together at the start of secondary_start_kernel(). Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I43304d06602216fbb5b968ff83e0face11e238f5 (cherry picked from commit 580efaa7ccfb8c0790dce4396434f0e5ac8d86ee) Signed-off-by: Zubin Mithra --- arch/arm64/kernel/smp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index a84623d91410..8526ff64e4b2 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -135,7 +135,10 @@ static void smp_store_cpu_info(unsigned int cpuid) asmlinkage void secondary_start_kernel(void) { struct mm_struct *mm = &init_mm; - unsigned int cpu = smp_processor_id(); + unsigned int cpu; + + cpu = task_cpu(current); + set_my_cpu_offset(per_cpu_offset(cpu)); /* * All kernel threads share the same mm context; grab a @@ -144,8 +147,6 @@ asmlinkage void secondary_start_kernel(void) atomic_inc(&mm->mm_count); current->active_mm = mm; - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. From f7f69dfb0277528bfcad3c08558dc7d60bc906dd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:11 +0000 Subject: [PATCH 0984/3220] UPSTREAM: arm64: make cpu number a percpu variable In the absence of CONFIG_THREAD_INFO_IN_TASK, core code maintains thread_info::cpu, and low-level architecture code can access this to build raw_smp_processor_id(). With CONFIG_THREAD_INFO_IN_TASK, core code maintains task_struct::cpu, which for reasons of the header soup is not accessible to low-level arch code. Instead, we can maintain a percpu variable containing the cpu number. 
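[For reference, generic code hides this config split behind task_cpu(); a simplified sketch of the accessor from <linux/sched.h> of this era:

static inline unsigned int task_cpu(const struct task_struct *p)
{
#ifdef CONFIG_THREAD_INFO_IN_TASK
	return p->cpu;			/* cpu now lives in task_struct */
#else
	return task_thread_info(p)->cpu;	/* classic on-stack thread_info */
#endif
}

Low-level arch headers cannot pull in <linux/sched.h> to use this, which is what forces the percpu variable introduced below.]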
For both the old and new implementation of raw_smp_processor_id(), we read a sysreg into a GPR, add an offset, and load the result. As the offset is now larger, it may not be folded into the load, but otherwise the assembly shouldn't change much. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: James Morse Cc: Suzuki K Poulose Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: I154927b0f9fc0ebbbed88c9958408bbb19cf09de (cherry picked from commit 57c82954e77fa12c1023e87210d2ede77aaa0058) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/smp.h | 11 ++++++++++- arch/arm64/kernel/smp.c | 5 +++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 2013a4dc5124..d7e851c5bc42 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -16,11 +16,20 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H +#include <asm/percpu.h> + #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/thread_info.h> -#define raw_smp_processor_id() (current_thread_info()->cpu) +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); + +/* + * We don't use this_cpu_read(cpu_number) as that has implicit writes to + * preempt_count, and associated (compiler) barriers, that we'd like to avoid + * the expense of. If we're preemptible, the value can be stale at use anyway. + */ +#define raw_smp_processor_id() (*this_cpu_ptr(&cpu_number)) struct seq_file; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 8526ff64e4b2..0b66134345f7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -57,6 +57,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/ipi.h> +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -608,6 +611,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (max_cpus == 0) break; + per_cpu(cpu_number, cpu) = cpu; + if (cpu == smp_processor_id()) continue; From 1cdfc007f328200a950b65f8ddd69b41cd2fb8fc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:12 +0000 Subject: [PATCH 0985/3220] UPSTREAM: arm64: assembler: introduce ldr_this_cpu Shortly we will want to load a percpu variable in the return from userspace path. We can save an instruction by folding the addition of the percpu offset into the load instruction, and this patch adds a new helper to do so. At the same time, we clean up this_cpu_ptr for consistency. As with {adr,ldr,str}_l, we change the template to take the destination register first, and name this dst. Secondly, we rename the macro to adr_this_cpu, following the scheme of adr_l, and matching the newly added ldr_this_cpu. 
Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: Ard Biesheuvel Cc: James Morse Cc: Will Deacon Signed-off-by: Catalin Marinas Bug: 38331309 Change-Id: Iaaf4ea9674ab89289badee216b5305204172895e (cherry picked from commit 1b7e2296a822dfd2349960addc42a139360ce769) Signed-off-by: Zubin Mithra --- arch/arm64/include/asm/assembler.h | 19 +++++++++++++++---- arch/arm64/kernel/entry.S | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index d8855ca6068a..e450bb6d21bd 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -223,14 +223,25 @@ lr .req x30 // link register .endm /* + * @dst: Result of per_cpu(sym, smp_processor_id()) * @sym: The name of the per-cpu variable - * @reg: Result of per_cpu(sym, smp_processor_id()) * @tmp: scratch register */ - .macro this_cpu_ptr, sym, reg, tmp - adr_l \reg, \sym + .macro adr_this_cpu, dst, sym, tmp + adr_l \dst, \sym mrs \tmp, tpidr_el1 - add \reg, \reg, \tmp + add \dst, \dst, \tmp + .endm + + /* + * @dst: Result of READ_ONCE(per_cpu(sym, smp_processor_id())) + * @sym: The name of the per-cpu variable + * @tmp: scratch register + */ + .macro ldr_this_cpu dst, sym, tmp + adr_l \dst, \sym + mrs \tmp, tpidr_el1 + ldr \dst, [\dst, \tmp] .endm /* diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3f9d78612e57..ed2df0570cc8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -266,7 +266,7 @@ alternative_endif cmp x25, tsk b.ne 9998f - this_cpu_ptr irq_stack, x25, x26 + adr_this_cpu x25, irq_stack, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 From 4ca3c2cf00be2fa1cd9cc576dd53adef96640701 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 Nov 2016 20:23:13 +0000 Subject: [PATCH 0986/3220] BACKPORT: arm64: split thread_info from task stack This patch moves arm64's struct thread_info from the task stack into task_struct. This protects thread_info from corruption in the case of stack overflows, and makes its address harder to determine if stack addresses are leaked, making a number of attacks more difficult. Precise detection and handling of overflow is left for subsequent patches. Largely, this involves changing code to store the task_struct in sp_el0, and acquire the thread_info from the task struct. Core code now implements current_thread_info(), and as noted in this relies on offsetof(task_struct, thread_info) == 0, enforced by core code. This change means that the 'tsk' register used in entry.S now points to a task_struct, rather than a thread_info as it used to. To make this clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets appropriately updated to account for the structural change. Userspace clobbers sp_el0, and we can no longer restore this from the stack. Instead, the current task is cached in a per-cpu variable that we can safely access from early assembly as interrupts are disabled (and we are thus not preemptible). Both secondary entry and idle are updated to stash the sp and task pointer separately. Signed-off-by: Mark Rutland Tested-by: Laura Abbott Cc: AKASHI Takahiro Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: James Morse Cc: Kees Cook Cc: Suzuki K Poulose Cc: Will Deacon Signed-off-by: Catalin Marinas This is a modification of Mark Rutland's original patch. Guards to check if CONFIG_THREAD_INFO_IN_TASK is used has been inserted. get_current() for when CONFIG_THREAD_INFO_IN_TASK is not used has been added to arch/arm64/include/asm/current.h. 
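[One subtlety worth spelling out: after this change, core code's current_thread_info() is nothing more than a cast of the task pointer (see the <linux/thread_info.h> hunk earlier in this series: #define current_thread_info() ((struct thread_info *)current)). That cast is sound only because thread_info is the first member of task_struct, which core code enforces at build time. A sketch of such an assertion; the function name is hypothetical, and the real check lives in core kernel code:

static void __init thread_info_layout_check(void)
{
	/* current_thread_info() casts task_struct * to thread_info *,
	 * which is only valid while thread_info stays at offset 0. */
	BUILD_BUG_ON(offsetof(struct task_struct, thread_info) != 0);
}]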
Bug: 38331309 Change-Id: Ic5eae344a7c2baea0864f6ae16be1e9c60c0a74a (cherry picked from commit c02433dd6de32f042cf3ffe476746b1115b8c096) Signed-off-by: Zubin Mithra --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/current.h | 27 +++++++++++++ arch/arm64/include/asm/smp.h | 3 ++ arch/arm64/include/asm/thread_info.h | 16 +++++++- arch/arm64/kernel/asm-offsets.c | 14 +++++-- arch/arm64/kernel/entry.S | 58 ++++++++++++++++++++++++++-- arch/arm64/kernel/head.S | 18 ++++++++- arch/arm64/kernel/process.c | 22 +++++++++++ arch/arm64/kernel/smp.c | 6 +++ 9 files changed, 156 insertions(+), 9 deletions(-) create mode 100644 arch/arm64/include/asm/current.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 35be8566140e..da007c26de52 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -98,6 +98,7 @@ config ARM64 select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select HAVE_CONTEXT_TRACKING + select THREAD_INFO_IN_TASK help ARM 64-bit (AArch64) Linux support. diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h new file mode 100644 index 000000000000..2e61d21294ba --- /dev/null +++ b/arch/arm64/include/asm/current.h @@ -0,0 +1,27 @@ +#ifndef __ASM_CURRENT_H +#define __ASM_CURRENT_H + +#include + +#include + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_THREAD_INFO_IN_TASK +struct task_struct; + +static __always_inline struct task_struct *get_current(void) +{ + return (struct task_struct *)read_sysreg(sp_el0); +} +#define current get_current() +#else +#include +#define get_current() (current_thread_info()->task) +#define current get_current() +#endif + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_CURRENT_H */ + diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index d7e851c5bc42..a05033beb2a2 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -66,6 +66,9 @@ asmlinkage void secondary_start_kernel(void); */ struct secondary_data { void *stack; +#ifdef CONFIG_THREAD_INFO_IN_TASK + struct task_struct *task; +#endif }; extern struct secondary_data secondary_data; extern void secondary_entry(void); diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 41e4fd78e345..ec4f8c04aeda 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -47,14 +47,25 @@ typedef unsigned long mm_segment_t; struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ +#ifndef CONFIG_THREAD_INFO_IN_TASK struct task_struct *task; /* main task structure */ +#endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif int preempt_count; /* 0 => preemptable, <0 => bug */ +#ifndef CONFIG_THREAD_INFO_IN_TASK int cpu; /* cpu */ +#endif }; +#ifdef CONFIG_THREAD_INFO_IN_TASK +#define INIT_THREAD_INFO(tsk) \ +{ \ + .preempt_count = INIT_PREEMPT_COUNT, \ + .addr_limit = KERNEL_DS, \ +} +#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -63,8 +74,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_stack (init_thread_union.stack) - /* * how to get the thread information struct from C */ @@ -81,6 +90,9 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } +#endif + +#define init_stack (init_thread_union.stack) #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 66357a43c097..24e65f0897ee 100644 --- 
a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -33,12 +33,15 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); + DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); +#else DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); -#ifndef CONFIG_THREAD_INFO_IN_TASK - DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); #endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0)); @@ -113,6 +116,11 @@ int main(void) DEFINE(TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); BLANK(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); + DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); + BLANK(); +#endif #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ed2df0570cc8..dba3aceaed2f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -93,9 +93,14 @@ .if \el == 0 mrs x21, sp_el0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, + ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug +#else mov tsk, sp and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug +#endif disable_step_tsk x19, x20 // exceptions when scheduling. mov x29, xzr // fp pointed to user-space @@ -103,10 +108,18 @@ add x21, sp, #S_FRAME_SIZE get_thread_info tsk /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else ldr x20, [tsk, #TI_ADDR_LIMIT] +#endif str x20, [sp, #S_ORIG_ADDR_LIMIT] mov x20, #TASK_SIZE_64 +#ifdef CONFIG_THREAD_INFO_IN_TASK + str x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else str x20, [tsk, #TI_ADDR_LIMIT] +#endif ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO) .endif /* \el == 0 */ mrs x22, elr_el1 @@ -168,7 +181,11 @@ alternative_else_nop_endif .if \el != 0 /* Restore the task's original addr_limit. */ ldr x20, [sp, #S_ORIG_ADDR_LIMIT] +#ifdef CONFIG_THREAD_INFO_IN_TASK + str x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else str x20, [tsk, #TI_ADDR_LIMIT] +#endif /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif @@ -258,13 +275,20 @@ alternative_endif mov x19, sp // preserve the original sp /* - * Compare sp with the current thread_info, if the top - * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and - * should switch to the irq stack. + * Compare sp with the base of the task stack. + * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, + * and should switch to the irq stack. 
*/ +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x25, [tsk, TSK_STACK] + eor x25, x25, x19 + and x25, x25, #~(THREAD_SIZE - 1) + cbnz x25, 9998f +#else and x25, x19, #~(THREAD_SIZE - 1) cmp x25, tsk b.ne 9998f +#endif adr_this_cpu x25, irq_stack, x26 mov x26, #IRQ_STACK_START_SP @@ -493,9 +517,17 @@ el1_irq: irq_handler #ifdef CONFIG_PREEMPT +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count +#else ldr w24, [tsk, #TI_PREEMPT] // get preempt count +#endif cbnz w24, 1f // preempt count != 0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x0, [tsk, #TSK_TI_FLAGS] // get flags +#else ldr x0, [tsk, #TI_FLAGS] // get flags +#endif tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? bl el1_preempt 1: @@ -510,7 +542,11 @@ ENDPROC(el1_irq) el1_preempt: mov x24, lr 1: bl preempt_schedule_irq // irq en/disable is done inside +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS +#else ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS +#endif tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? ret x24 #endif @@ -730,8 +766,12 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 +#ifdef CONFIG_THREAD_INFO_IN_TASK + msr sp_el0, x1 +#else and x9, x9, #~(THREAD_SIZE - 1) msr sp_el0, x9 +#endif ret ENDPROC(cpu_switch_to) @@ -742,7 +782,11 @@ ENDPROC(cpu_switch_to) ret_fast_syscall: disable_irq // disable interrupts str x0, [sp, #S_X0] // returned x0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing +#else ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing +#endif and x2, x1, #_TIF_SYSCALL_WORK cbnz x2, ret_fast_syscall_trace and x2, x1, #_TIF_WORK_MASK @@ -774,7 +818,11 @@ work_resched: */ ret_to_user: disable_irq // disable interrupts +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x1, [tsk, #TSK_TI_FLAGS] +#else ldr x1, [tsk, #TI_FLAGS] +#endif and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending enable_step_tsk x1, x2 @@ -806,7 +854,11 @@ el0_svc_naked: // compat entry point enable_dbg_and_irq ct_user_exit 1 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks +#else ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks +#endif tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace cmp scno, sc_nr // check upper syscall limit diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 99710399aa38..16d0820fff3a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -424,6 +424,7 @@ kernel_img_size: .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: mov x28, lr // preserve LR + adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -474,10 +475,18 @@ __mmap_switched: dsb sy // with MMU off #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + adrp x4, init_thread_union + add sp, x4, #THREAD_SIZE + adr_l x5, init_task + msr sp_el0, x5 // Save thread_info +#else adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) msr sp_el0, x4 // Save thread_info +#endif + str_l x21, __fdt_pointer, x5 // Save FDT pointer ldr_l x4, kimage_vaddr // Save the offset between @@ -689,11 +698,18 @@ ENTRY(__secondary_switched) adr_l x5, vectors msr vbar_el1, x5 isb - +#ifdef CONFIG_THREAD_INFO_IN_TASK + adr_l x0, secondary_data + ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack + mov sp, x1 + ldr x2, [x0, #CPU_BOOT_TASK] + msr sp_el0, x2 +#else ldr_l x0, secondary_data // get secondary_data.stack mov sp, x0 and x0, x0, #~(THREAD_SIZE - 1) msr sp_el0, x0 // save 
thread_info +#endif mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 75dac2c0d437..e34bcf3f2c35 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -45,6 +45,9 @@ #include #include #include +#ifdef CONFIG_THREAD_INFO_IN_TASK +#include +#endif #include #include @@ -390,6 +393,22 @@ void uao_thread_switch(struct task_struct *next) } } +#ifdef CONFIG_THREAD_INFO_IN_TASK +/* + * We store our current task in sp_el0, which is clobbered by userspace. Keep a + * shadow copy so that we can restore this upon entry from userspace. + * + * This is *only* for exception entry from EL0, and is not valid until we + * __switch_to() a user task. + */ +DEFINE_PER_CPU(struct task_struct *, __entry_task); + +static void entry_task_switch(struct task_struct *next) +{ + __this_cpu_write(__entry_task, next); +} +#endif + /* * Thread switching. */ @@ -402,6 +421,9 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); +#ifdef CONFIG_THREAD_INFO_IN_TASK + entry_task_switch(next); +#endif uao_thread_switch(next); /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 0b66134345f7..ac899acec6eb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -98,6 +98,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * We need to tell the secondary core where to find its stack and the * page tables. */ +#ifdef CONFIG_THREAD_INFO_IN_TASK + secondary_data.task = idle; +#endif secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -121,6 +124,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_err("CPU%u: failed to boot: %d\n", cpu, ret); } +#ifdef CONFIG_THREAD_INFO_IN_TASK + secondary_data.task = NULL; +#endif secondary_data.stack = NULL; return ret; From b1c79e32b7a8b42487f4f4fe970e5bc7d5dd48a9 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 10 Aug 2017 10:13:39 -0700 Subject: [PATCH 0987/3220] android: configs: move quota-related configs to recommended Bug: 64468882 Change-Id: Ifdd59c83ca52ecaca00ddcea6a003a2611bf8694 Signed-off-by: Steve Muckle --- android/configs/android-base.cfg | 5 ----- android/configs/android-recommended.cfg | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 28dce6c0516d..48b2cdbe8d49 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -139,11 +139,6 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_PROFILING=y -CONFIG_QFMT_V2=y -CONFIG_QUOTA=y -CONFIG_QUOTACTL=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -CONFIG_QUOTA_TREE=y CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg index eecf8d80453a..3d7e5e168940 100644 --- a/android/configs/android-recommended.cfg +++ b/android/configs/android-recommended.cfg @@ -109,6 +109,11 @@ CONFIG_POWER_SUPPLY=y CONFIG_PSTORE=y CONFIG_PSTORE_CONSOLE=y CONFIG_PSTORE_RAM=y +CONFIG_QFMT_V2=y +CONFIG_QUOTA=y +CONFIG_QUOTACTL=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +CONFIG_QUOTA_TREE=y CONFIG_SCHEDSTATS=y CONFIG_SMARTJOYPLUS_FF=y CONFIG_SND=y From 4d666b500c2de3edc65e2ab5d5fe36780c230604 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 10 Aug 2017 12:32:00 +0200 Subject: 
[PATCH 0988/3220] ANDROID: binder: call poll_wait() unconditionally. Because we're not guaranteed that subsequent calls to poll() will have a poll_table_struct parameter with _qproc set. When _qproc is not set, poll_wait() is a noop, and we won't be woken up correctly. Bug: 64552728 Change-Id: I5b904c9886b6b0994d1631a636f5c5e5f6327950 Test: binderLibTest stops hanging with new test Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4cf8e05c7a03..ea718f2995d5 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3802,12 +3802,6 @@ static void binder_stat_br(struct binder_proc *proc, } } -static int binder_has_thread_work(struct binder_thread *thread) -{ - return !binder_worklist_empty(thread->proc, &thread->todo) || - thread->looper_need_return; -} - static int binder_put_node_cmd(struct binder_proc *proc, struct binder_thread *thread, void __user **ptrp, @@ -4438,12 +4432,9 @@ static unsigned int binder_poll(struct file *filp, binder_inner_proc_unlock(thread->proc); - if (binder_has_work(thread, wait_for_proc_work)) - return POLLIN; - poll_wait(filp, &thread->wait, wait); - if (binder_has_thread_work(thread)) + if (binder_has_work(thread, wait_for_proc_work)) return POLLIN; return 0; From c825eca5ec45a467e78e66425fadfb001a8199df Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 10 Aug 2017 13:50:52 +0200 Subject: [PATCH 0989/3220] ANDROID: binder: don't enqueue death notifications to thread todo. This allows userspace to request death notifications without having to worry about getting an immediate callback on the same thread; one scenario where this would be problematic is if the death recipient handler grabs a lock that was already taken earlier (eg as part of a nested transaction). Bug: 23525545 Test: binderLibTest.DeathNotificationThread passes Change-Id: I955e16306fe3110dacb9a391ffff1bf869249495 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ea718f2995d5..061a7258779e 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3670,22 +3670,12 @@ static int binder_thread_write(struct binder_proc *proc, ref->death = death; if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; - if (thread->looper & - (BINDER_LOOPER_STATE_REGISTERED | - BINDER_LOOPER_STATE_ENTERED)) - binder_enqueue_work( - proc, - &ref->death->work, - &thread->todo); - else { - binder_inner_proc_lock(proc); - binder_enqueue_work_ilocked( - &ref->death->work, - &proc->todo); - binder_wakeup_proc_ilocked( - proc); - binder_inner_proc_unlock(proc); - } + + binder_inner_proc_lock(proc); + binder_enqueue_work_ilocked( + &ref->death->work, &proc->todo); + binder_wakeup_proc_ilocked(proc); + binder_inner_proc_unlock(proc); } } else { if (ref->death == NULL) { From 6f227409a1797b448402a3ebf7523a229b8b6cbd Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 10 Aug 2017 13:56:16 +0200 Subject: [PATCH 0990/3220] ANDROID: binder: don't queue async transactions to thread. 
This can cause issues with processes using the poll() interface: 1) client sends two oneway transactions 2) the second one gets queued on async_todo (because the server didn't handle the first one yet) 3) server returns from poll(), picks up the first transaction and does transaction work 4) server is done with the transaction, sends BC_FREE_BUFFER, and the second transaction gets moved to thread->todo 5) libbinder's handlePolledCommands() only handles the commands in the current data buffer, so doesn't see the new transaction 6) the server continues running and issues a new outgoing transaction. Now, it suddenly finds the incoming oneway transaction on its thread todo, and returns that to userspace. 7) userspace does not expect this to happen; it may be holding a lock while making the outgoing transaction, and if handling the incoming transaction requires taking the same lock, userspace will deadlock. By queueing the async transaction to the proc workqueue, we make sure it's only picked up when a thread is ready for proc work. Bug: 38201220 Bug: 63075553 Bug: 63079216 Change-Id: I84268cc112f735d7e3173793873dfdb4b268468b Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 061a7258779e..bfdd52ea0d1c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3522,11 +3522,13 @@ static int binder_thread_write(struct binder_proc *proc, BUG_ON(buf_node->proc != proc); w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); - if (!w) + if (!w) { buf_node->has_async_transaction = 0; - else + } else { binder_enqueue_work_ilocked( - w, &thread->todo); + w, &proc->todo); + binder_wakeup_proc_ilocked(proc); + } binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); From eac37ad2df7ecf81b8dba0b64e9ed52351f73d63 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 26 Jul 2017 12:14:41 -0700 Subject: [PATCH 0991/3220] ANDROID: keychord: Fix a slab out-of-bounds read. Fix a slab out of bounds read in keychord_write(), detected by KASAN. Signed-off-by: Mohan Srinivasan Bug: 63962952 Change-Id: Iafef48b5d7283750ac0f39f5aaa767b1c3bf2004 (cherry picked from commit 913d980e07d84a843f5323acc55d185212a2abec) --- drivers/input/misc/keychord.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index a5ea27ad0e16..f148b937b4e5 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -232,9 +232,11 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, { struct keychord_device *kdev = file->private_data; struct input_keychord *keychords = 0; - struct input_keychord *keychord, *next, *end; + struct input_keychord *keychord; int ret, i, key; unsigned long flags; + size_t resid = count; + size_t key_bytes; if (count < sizeof(struct input_keychord)) return -EINVAL; @@ -265,15 +267,29 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, kdev->head = kdev->tail = 0; keychord = keychords; - end = (struct input_keychord *)((char *)keychord + count); - while (keychord < end) { - next = NEXT_KEYCHORD(keychord); - if (keychord->count <= 0 || next > end) { + while (resid > 0) { + /* Is the entire keychord entry header present ? 
*/ + if (resid < sizeof(struct input_keychord)) { + pr_err("keychord: Insufficient bytes present for header %lu\n", + resid); + goto err_unlock_return; + } + resid -= sizeof(struct input_keychord); + if (keychord->count <= 0) { pr_err("keychord: invalid keycode count %d\n", keychord->count); goto err_unlock_return; } + key_bytes = keychord->count * sizeof(keychord->keycodes[0]); + /* Do we have all the expected keycodes ? */ + if (resid < key_bytes) { + pr_err("keychord: Insufficient bytes present for keycount %lu\n", + resid); + goto err_unlock_return; + } + resid -= key_bytes; + if (keychord->version != KEYCHORD_VERSION) { pr_err("keychord: unsupported version %d\n", keychord->version); @@ -292,7 +308,7 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, } kdev->keychord_count++; - keychord = next; + keychord = NEXT_KEYCHORD(keychord); } kdev->keychords = keychords; From dd5826152c53c1ea77986c1f3d99050d538e65e3 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Thu, 27 Jul 2017 11:30:32 -0700 Subject: [PATCH 0992/3220] Use %zu to print resid (size_t). Print resid (size_t) portably. Signed-off-by: Mohan Srinivasan Change-Id: Ic5c9dc498bfeef2be21594ec5efd45a98a3c4b4d (cherry picked from commit a1e4c795e1b6de6b34b8cbc75499d1675608c36b) --- drivers/input/misc/keychord.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index f148b937b4e5..c5ab3ddda456 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -271,7 +271,7 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, while (resid > 0) { /* Is the entire keychord entry header present ? */ if (resid < sizeof(struct input_keychord)) { - pr_err("keychord: Insufficient bytes present for header %lu\n", + pr_err("keychord: Insufficient bytes present for header %zu\n", resid); goto err_unlock_return; } @@ -284,7 +284,7 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, key_bytes = keychord->count * sizeof(keychord->keycodes[0]); /* Do we have all the expected keycodes ? */ if (resid < key_bytes) { - pr_err("keychord: Insufficient bytes present for keycount %lu\n", + pr_err("keychord: Insufficient bytes present for keycount %zu\n", resid); goto err_unlock_return; } From 462acca2816ee795fde81bdd99ca6e10bfe00d27 Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 9 Aug 2017 12:16:56 -0700 Subject: [PATCH 0993/3220] ANDROID: keychord: Fix races in keychord_write. There are multiple bugs caused by threads racing in keychord_write. 1) Threads racing through this function can cause the same element to be added to a linked list twice (multiple calls to input_register_handler() for the same input_handler struct). And the races can also cause an element in a linked list that doesn't exist attempted to be removed (multiple calls to input_unregister_handler() with the same input_handler struct). 2) The races can also cause duplicate kfree's of the keychords struct. 
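[The fix below serializes writers with a KEYCHORD_BUSY bit guarded by the driver's existing spinlock plus a waitqueue; as the comment in the patch notes, mutex_lock_interruptible() would cover this use case equally well. A sketch of that alternative shape; this is hypothetical, not what was merged, and do_keychord_write() stands in for the existing function body:

/* Hypothetical alternative: one mutex serializing keychord_write(). */
static DEFINE_MUTEX(keychord_write_mutex);

static ssize_t keychord_write(struct file *file, const char __user *buffer,
			      size_t count, loff_t *ppos)
{
	ssize_t ret;

	/* Sleeps until the previous writer finishes, but can still be
	 * interrupted by a signal, like the merged busy-bit lock. */
	if (mutex_lock_interruptible(&keychord_write_mutex))
		return -ERESTARTSYS;
	ret = do_keychord_write(file, buffer, count);
	mutex_unlock(&keychord_write_mutex);
	return ret;
}

Either shape closes the same window: only one thread at a time may tear down and re-register the input handler or swap kdev->keychords.]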
Bug: 64133562 Bug: 63974334 Change-Id: I6329a4d58c665fab5d3e96ef96391e07b4941e80 Signed-off-by: Mohan Srinivasan (cherry picked from commit 59584701f1e2ce8ce024570576b206bea6ac69cf) --- drivers/input/misc/keychord.c | 61 ++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index c5ab3ddda456..1673e4239403 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -60,6 +60,10 @@ struct keychord_device { unsigned char head; unsigned char tail; __u16 buff[BUFFER_SIZE]; + /* Bit to serialize writes to this device */ +#define KEYCHORD_BUSY 0x01 + unsigned long flags; + wait_queue_head_t write_waitq; }; static int check_keychord(struct keychord_device *kdev, @@ -172,7 +176,6 @@ static int keychord_connect(struct input_handler *handler, goto err_input_open_device; pr_info("keychord: using input dev %s for fevent\n", dev->name); - return 0; err_input_open_device: @@ -224,6 +227,41 @@ static ssize_t keychord_read(struct file *file, char __user *buffer, return count; } +/* + * serializes writes on a device. can use mutex_lock_interruptible() + * for this particular use case as well - a matter of preference. + */ +static int +keychord_write_lock(struct keychord_device *kdev) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&kdev->lock, flags); + while (kdev->flags & KEYCHORD_BUSY) { + spin_unlock_irqrestore(&kdev->lock, flags); + ret = wait_event_interruptible(kdev->write_waitq, + ((kdev->flags & KEYCHORD_BUSY) == 0)); + if (ret) + return ret; + spin_lock_irqsave(&kdev->lock, flags); + } + kdev->flags |= KEYCHORD_BUSY; + spin_unlock_irqrestore(&kdev->lock, flags); + return 0; +} + +static void +keychord_write_unlock(struct keychord_device *kdev) +{ + unsigned long flags; + + spin_lock_irqsave(&kdev->lock, flags); + kdev->flags &= ~KEYCHORD_BUSY; + spin_unlock_irqrestore(&kdev->lock, flags); + wake_up_interruptible(&kdev->write_waitq); +} + /* * keychord_write is used to configure the driver */ @@ -250,6 +288,22 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, return -EFAULT; } + /* + * Serialize writes to this device to prevent various races. + * 1) writers racing here could do duplicate input_unregister_handler() + * calls, resulting in attempting to unlink a node from a list that + * does not exist. + * 2) writers racing here could do duplicate input_register_handler() calls + * below, resulting in a duplicate insertion of a node into the list. + * 3) a double kfree of keychords can occur (in the event that + * input_register_handler() fails below. 
+ */ + ret = keychord_write_lock(kdev); + if (ret) { + kfree(keychords); + return ret; + } + /* unregister handler before changing configuration */ if (kdev->registered) { input_unregister_handler(&kdev->input_handler); @@ -318,15 +372,19 @@ static ssize_t keychord_write(struct file *file, const char __user *buffer, if (ret) { kfree(keychords); kdev->keychords = 0; + keychord_write_unlock(kdev); return ret; } kdev->registered = 1; + keychord_write_unlock(kdev); + return count; err_unlock_return: spin_unlock_irqrestore(&kdev->lock, flags); kfree(keychords); + keychord_write_unlock(kdev); return -EINVAL; } @@ -352,6 +410,7 @@ static int keychord_open(struct inode *inode, struct file *file) spin_lock_init(&kdev->lock); init_waitqueue_head(&kdev->waitq); + init_waitqueue_head(&kdev->write_waitq); kdev->input_handler.event = keychord_event; kdev->input_handler.connect = keychord_connect; From 60366263e6aeac89f9c92978d075529ef3303c0a Mon Sep 17 00:00:00 2001 From: Mohan Srinivasan Date: Wed, 9 Aug 2017 12:36:33 -0700 Subject: [PATCH 0994/3220] ANDROID: keychord: Fix for a memory leak in keychord. Fixes a steady memory leak in the keychord release code. A close of the keychord device will leak 1 keychord structure. Easily reproducible by a simple program that does an open()->write()->close() of the keychord device. Bug: 64483974 Change-Id: I1fa402c666cffb00b8cfd6379d9fe47a0989152c Signed-off-by: Mohan Srinivasan (cherry picked from commit 72a8dae2c25d0277e48672ee85b70236268add01) --- drivers/input/misc/keychord.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c index 1673e4239403..fdcc14653b64 100644 --- a/drivers/input/misc/keychord.c +++ b/drivers/input/misc/keychord.c @@ -432,6 +432,7 @@ static int keychord_release(struct inode *inode, struct file *file) if (kdev->registered) input_unregister_handler(&kdev->input_handler); + kfree(kdev->keychords); kfree(kdev); return 0; From c932c1b7730408e592a7f46229dcb325da28d4c4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jul 2016 15:48:20 -0700 Subject: [PATCH 0995/3220] UPSTREAM: kdb: use task_cpu() instead of task_thread_info()->cpu commit e558af65be65713ef2e8b2aa637c6263caeed172 upstream. We'll need this cleanup to make the cpu field in thread_info be optional. Link: http://lkml.kernel.org/r/da298328dc77ea494576c2f20a934218e758a6fa.1468523549.git.luto@kernel.org Signed-off-by: Andy Lutomirski Cc: Jason Wessel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: I0cd616f086f0eb54ed997ea153382fbf6188dba9 Signed-off-by: Amit Pundir --- include/linux/kdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kdb.h b/include/linux/kdb.h index a19bcf9e762e..410decacff8f 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -177,7 +177,7 @@ extern int kdb_get_kbd_char(void); static inline int kdb_process_cpu(const struct task_struct *p) { - unsigned int cpu = task_thread_info(p)->cpu; + unsigned int cpu = task_cpu(p); if (cpu > num_possible_cpus()) cpu = 0; return cpu; From e991aa38ea3856f3fb8f66b30fa44c4e85b502cd Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 11 Aug 2017 14:21:53 +0530 Subject: [PATCH 0996/3220] ANDROID: arm64: fix undeclared 'init_thread_info' error init_thread_info is deprecated in favour of THREAD_INFO_IN_TASK related changes, see Change-Id: Ia4769ddcc6fc556e9eb6193d64fc99fe2d9e39ab ("UPSTREAM: arm64: thread_info remove stale items"). 
Use init_task.thread_info instead, to fix following build error: arch/arm64/kernel/setup.c: In function 'setup_arch': arch/arm64/kernel/setup.c:356:2: error: 'init_thread_info' undeclared (first use in this function) init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); ^ Change-Id: I13bf03211f0d918d388d1436099d286c10a23e5d Fixes: Change-Id: I85a49f70e13b153b9903851edf56f6531c14e6de ("BACKPORT: arm64: Disable TTBR0_EL1 during normal kernel execution") Fixes: Change-Id: Ia4769ddcc6fc556e9eb6193d64fc99fe2d9e39ab ("UPSTREAM: arm64: thread_info remove stale items") Signed-off-by: Amit Pundir --- arch/arm64/include/asm/thread_info.h | 2 ++ arch/arm64/kernel/setup.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index ec4f8c04aeda..67dd228c3f17 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -90,6 +90,8 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } + +#define init_thread_info (init_thread_union.thread_info) #endif #define init_stack (init_thread_union.stack) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 6591bf23422b..ede6cc373f43 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -349,11 +349,15 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* - * Make sure init_thread_info.ttbr0 always generates translation + * Make sure thread_info.ttbr0 always generates translation * faults in case uaccess_enable() is inadvertently called by the init * thread. */ - init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#ifdef CONFIG_THREAD_INFO_IN_TASK + init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#else + init_thread_info.ttbr0 = (init_thread_union.thread_info); +#endif #endif #ifdef CONFIG_VT From 286f536cbfe03c0a10ddac2fc214d805c494e4c0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jun 2016 12:11:17 -0700 Subject: [PATCH 0997/3220] UPSTREAM: locking: avoid passing around 'thread_info' in mutex debugging code commit 6720a305df74ca30bcc10fc316881641b6ff0c80 upstream. None of the code actually wants a thread_info, it all wants a task_struct, and it's just converting back and forth between the two ("ti->task" to get the task_struct from the thread_info, and "task_thread_info(task)" to go the other way). No semantic change. 
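[The round trip being deleted is just this pair of conversions; the helper names below are illustrative, as the kernel open-codes both directions:

/* thread_info -> task_struct: via the back-pointer thread_info carries */
static inline struct task_struct *ti_to_task(struct thread_info *ti)
{
	return ti->task;
}

/* task_struct -> thread_info: via the stack-based lookup */
static inline struct thread_info *task_to_ti(struct task_struct *task)
{
	return task_thread_info(task);
}

Callers of debug_mutex_add_waiter() were doing task_thread_info(task) only so the callee could immediately do ti->task again; passing the task_struct directly removes both steps, and stops the debug code from assuming thread_info even has a task back-pointer.]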
Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds Change-Id: Idd7d0fe0b2b57ece9a969d178576a12f3ae90701 Signed-off-by: Amit Pundir --- kernel/locking/mutex-debug.c | 12 ++++++------ kernel/locking/mutex-debug.h | 4 ++-- kernel/locking/mutex.c | 6 +++--- kernel/locking/mutex.h | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 89350f924c85..f42f83a36506 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -605,7 +605,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@ do { spin_lock(lock); (void)(flags); } while (0) #define 
spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER From 107c1d57f89d1c758723eab704ca9cd44b86ebfe Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 15 Aug 2017 15:55:46 +0530 Subject: [PATCH 0998/3220] ANDROID: arm64: Fix a copy-paste error in prior init_thread_info build fix Fix a really embarrassing copy-paste error introduced in Change-Id: I13bf03211f0d918d388d1436099d286c10a23e5d to fix init_thread_info build error. Fixes: Change-Id: I13bf03211f0d918d388d1436099d286c10a23e5d ("ANDROID: arm64: fix undeclared 'init_thread_info' error") Signed-off-by: Amit Pundir --- arch/arm64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index ede6cc373f43..4a24719026a9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -356,7 +356,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_THREAD_INFO_IN_TASK init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page); #else - init_thread_info.ttbr0 = (init_thread_union.thread_info); + init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); #endif #endif From b32ed10511d469f0e05b0ada753379f4834c2169 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 3 Jan 2017 18:27:01 +0000 Subject: [PATCH 0999/3220] UPSTREAM: arm64: restore get_current() optimisation commit 9d84fb27fa135c99c9fe3de33628774a336a70a8 upstream. Commit c02433dd6de32f04 ("arm64: split thread_info from task stack") inverted the relationship between get_current() and current_thread_info(), with sp_el0 now holding the current task_struct rather than the current thread_info. The new implementation of get_current() prevents the compiler from being able to optimize repeated calls to either, resulting in a noticeable penalty in some microbenchmarks. This patch restores the previous optimisation by implementing get_current() in the same way as our old current_thread_info(), using a non-volatile asm statement. Acked-by: Will Deacon Signed-off-by: Mark Rutland Reported-by: Davidlohr Bueso Signed-off-by: Catalin Marinas Signed-off-by: Amit Pundir --- arch/arm64/include/asm/current.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h index 2e61d21294ba..483a6c9d3e10 100644 --- a/arch/arm64/include/asm/current.h +++ b/arch/arm64/include/asm/current.h @@ -10,9 +10,17 @@ #ifdef CONFIG_THREAD_INFO_IN_TASK struct task_struct; +/* + * We don't use read_sysreg() as we want the compiler to cache the value where + * possible. + */ static __always_inline struct task_struct *get_current(void) { - return (struct task_struct *)read_sysreg(sp_el0); + unsigned long sp_el0; + + asm ("mrs %0, sp_el0" : "=r" (sp_el0)); + + return (struct task_struct *)sp_el0; } #define current get_current() #else From 39a15ad2d7b90450f2defb57518dc8518fe43e66 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 1 Dec 2016 15:55:13 +0000 Subject: [PATCH 1000/3220] UPSTREAM: arm64: smp: Prevent raw_smp_processor_id() recursion Under CONFIG_DEBUG_PREEMPT=y, this_cpu_ptr() ends up calling back into raw_smp_processor_id(), resulting in some hilariously catastrophic infinite recursion. In the normal case, we have: #define this_cpu_ptr(ptr) raw_cpu_ptr(ptr) and everything is dandy.
However for CONFIG_DEBUG_PREEMPT, this_cpu_ptr() is defined in terms of my_cpu_offset, wherein the fun begins: #define my_cpu_offset per_cpu_offset(smp_processor_id()) ... #define smp_processor_id() debug_smp_processor_id() ... notrace unsigned int debug_smp_processor_id(void) { return check_preemption_disabled("smp_processor_id", ""); ... notrace static unsigned int check_preemption_disabled(const char *what1, const char *what2) { int this_cpu = raw_smp_processor_id(); and bang. Use raw_cpu_ptr() directly to avoid that. Fixes: 57c82954e77f ("arm64: make cpu number a percpu variable") Reported-by: Marek Szyprowski Acked-by: Will Deacon Signed-off-by: Robin Murphy Tested-by: Marek Szyprowski Signed-off-by: Catalin Marinas (cherry picked from commit 34a6980c82fb1342e7064844c95aa4cf933e5ecc) Signed-off-by: John Stultz --- arch/arm64/include/asm/smp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index a05033beb2a2..4325b3622a92 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -28,8 +28,10 @@ DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); * We don't use this_cpu_read(cpu_number) as that has implicit writes to * preempt_count, and associated (compiler) barriers, that we'd like to avoid * the expense of. If we're preemptible, the value can be stale at use anyway. + * And we can't use this_cpu_ptr() either, as that winds up recursing back + * here under CONFIG_DEBUG_PREEMPT=y. */ -#define raw_smp_processor_id() (*this_cpu_ptr(&cpu_number)) +#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number)) struct seq_file; From 623f33f213deb39994decbdc7d3b8cea82c4d558 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 20 Apr 2017 18:54:13 -0700 Subject: [PATCH 1001/3220] ANDROID: Use sk_uid to replace uid get from socket file Retrieve the socket uid from the sk_uid field added to struct sk instead of reading it from sk->socket->file. This prevents the packet from being dropped when the socket file doesn't exist. Bug: 37524657 Signed-off-by: Chenbo Feng Change-Id: I10545a308d61a2124077cb6f05fb0f5d9b4559ad --- net/netfilter/xt_qtaguid.c | 50 ++++++++++++----------------- net/netfilter/xt_qtaguid_internal.h | 4 +-- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index f95aba44e649..fbe4e1ecce6a 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1717,18 +1717,9 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); - if (sk != NULL) { - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); - } - if (sk == NULL || sk->sk_socket == NULL) { + + if (sk == NULL) { /* * Here, the qtaguid_find_sk() using connection tracking * couldn't find the owner, so for now we just count them @@ -1741,9 +1732,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (!(info->match & XT_QTAGUID_UID)) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", - par->hooknum, - sk ?
sk->sk_socket : NULL); + MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1751,16 +1740,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) res = false; goto put_sock_ret_res; } - filp = sk->sk_socket->file; - if (filp == NULL) { - MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); - account_for_uid(skb, sk, 0, par); - res = ((info->match ^ info->invert) & - (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; - atomic64_inc(&qtu_events.match_no_sk_file); - goto put_sock_ret_res; - } - sock_uid = filp->f_cred->fsuid; + sock_uid = sk->sk_uid; /* * TODO: unhack how to force just accounting. * For now we only do iface stats when the uid-owner is not requested @@ -1778,8 +1758,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - if ((uid_gte(filp->f_cred->fsuid, uid_min) && - uid_lte(filp->f_cred->fsuid, uid_max)) ^ + if ((uid_gte(sk->sk_uid, uid_min) && + uid_lte(sk->sk_uid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", par->hooknum); @@ -1790,7 +1770,19 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->match & XT_QTAGUID_GID) { kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - + set_sk_callback_lock = true; + read_lock_bh(&sk->sk_callback_lock); + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + if (!filp) { + res = ((info->match ^ info->invert) & XT_QTAGUID_GID) == 0; + atomic64_inc(&qtu_events.match_no_sk_gid); + goto put_sock_ret_res; + } + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { @@ -1962,7 +1954,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) "match_found_sk_in_ct=%llu " "match_found_no_sk_in_ct=%llu " "match_no_sk=%llu " - "match_no_sk_file=%llu\n", + "match_no_sk_gid=%llu\n", (u64)atomic64_read(&qtu_events.sockets_tagged), (u64)atomic64_read(&qtu_events.sockets_untagged), (u64)atomic64_read(&qtu_events.counter_set_changes), @@ -1974,7 +1966,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_file)); + (u64)atomic64_read(&qtu_events.match_no_sk_gid)); /* Count the following as part of the last item_index. No need * to lock the sock_tag_list here since it is already locked when diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index 8178fbdfb036..c7052707a6a4 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -289,10 +289,10 @@ struct qtaguid_event_counts { */ atomic64_t match_no_sk; /* - * The file ptr in the sk_socket wasn't there. + * The file ptr in the sk_socket wasn't there and we couldn't get GID. * This might happen for traffic while the socket is being closed. 
*/ - atomic64_t match_no_sk_file; + atomic64_t match_no_sk_gid; }; /* Track the set active_set for the given tag. */ From c5c3d46ccc5dbce050a070cd6c70dd36c67a15e5 Mon Sep 17 00:00:00 2001 From: Yang Jin Date: Wed, 26 Jul 2017 12:52:22 -0700 Subject: [PATCH 1002/3220] uid_sys_stats: log task io with a debug flag Add a hashmap inside each uid_entry to keep track of task name and io. Task full name is a combination of thread and process name. Bug: 63739275 Change-Id: I30083b757eaef8c61e55a213a883ce8d0c9cf2b1 Signed-off-by: Yang Jin --- drivers/misc/Kconfig | 7 + drivers/misc/uid_sys_stats.c | 303 ++++++++++++++++++++++++++++++----- 2 files changed, 266 insertions(+), 44 deletions(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 9e649992c177..88056d1e8feb 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -533,6 +533,13 @@ config UID_SYS_STATS Per UID based io statistics exported to /proc/uid_io Per UID based procstat control in /proc/uid_procstat +config UID_SYS_STATS_DEBUG + bool "Per-TASK statistics" + depends on UID_SYS_STATS + default n + help + Per TASK based io statistics exported to /proc/uid_io + config MEMORY_STATE_TIME tristate "Memory freq/bandwidth time statistics" depends on PROFILING diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 3c9d311106cd..2e4364b96b9a 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -52,6 +52,15 @@ struct io_stats { #define UID_STATE_DEAD_TASKS 4 #define UID_STATE_SIZE 5 +#define MAX_TASK_COMM_LEN 256 + +struct task_entry { + char comm[MAX_TASK_COMM_LEN]; + pid_t pid; + struct io_stats io[UID_STATE_SIZE]; + struct hlist_node hash; +}; + struct uid_entry { uid_t uid; cputime_t utime; @@ -61,8 +70,231 @@ struct uid_entry { int state; struct io_stats io[UID_STATE_SIZE]; struct hlist_node hash; +#ifdef CONFIG_UID_SYS_STATS_DEBUG + DECLARE_HASHTABLE(task_entries, UID_HASH_BITS); +#endif }; +static u64 compute_write_bytes(struct task_struct *task) +{ + if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes) + return 0; + + return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; +} + +static void compute_io_bucket_stats(struct io_stats *io_bucket, + struct io_stats *io_curr, + struct io_stats *io_last, + struct io_stats *io_dead) +{ + /* tasks could switch to another uid group, but its io_last in the + * previous uid group could still be positive. + * therefore before each update, do an overflow check first + */ + int64_t delta; + + delta = io_curr->read_bytes + io_dead->read_bytes - + io_last->read_bytes; + io_bucket->read_bytes += delta > 0 ? delta : 0; + delta = io_curr->write_bytes + io_dead->write_bytes - + io_last->write_bytes; + io_bucket->write_bytes += delta > 0 ? delta : 0; + delta = io_curr->rchar + io_dead->rchar - io_last->rchar; + io_bucket->rchar += delta > 0 ? delta : 0; + delta = io_curr->wchar + io_dead->wchar - io_last->wchar; + io_bucket->wchar += delta > 0 ? delta : 0; + delta = io_curr->fsync + io_dead->fsync - io_last->fsync; + io_bucket->fsync += delta > 0 ? 
delta : 0; + + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; + + memset(io_dead, 0, sizeof(struct io_stats)); +} + +#ifdef CONFIG_UID_SYS_STATS_DEBUG +static void get_full_task_comm(struct task_entry *task_entry, + struct task_struct *task) +{ + int i = 0, offset = 0, len = 0; + /* save one byte for terminating null character */ + int unused_len = MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1; + char buf[unused_len]; + struct mm_struct *mm = task->mm; + + /* fill the first TASK_COMM_LEN bytes with thread name */ + get_task_comm(task_entry->comm, task); + i = strlen(task_entry->comm); + while (i < TASK_COMM_LEN) + task_entry->comm[i++] = ' '; + + /* next the executable file name */ + if (mm) { + down_read(&mm->mmap_sem); + if (mm->exe_file) { + char *pathname = d_path(&mm->exe_file->f_path, buf, + unused_len); + + if (!IS_ERR(pathname)) { + len = strlcpy(task_entry->comm + i, pathname, + unused_len); + i += len; + task_entry->comm[i++] = ' '; + unused_len--; + } + } + up_read(&mm->mmap_sem); + } + unused_len -= len; + + /* fill the rest with command line argument + * replace each null or new line character + * between args in argv with whitespace */ + len = get_cmdline(task, buf, unused_len); + while (offset < len) { + if (buf[offset] != '\0' && buf[offset] != '\n') + task_entry->comm[i++] = buf[offset]; + else + task_entry->comm[i++] = ' '; + offset++; + } + + /* get rid of trailing whitespaces in case when arg is memset to + * zero before being reset in userspace + */ + while (task_entry->comm[i-1] == ' ') + i--; + task_entry->comm[i] = '\0'; +} + +static struct task_entry *find_task_entry(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct task_entry *task_entry; + + hash_for_each_possible(uid_entry->task_entries, task_entry, hash, + task->pid) { + if (task->pid == task_entry->pid) { + /* if thread name changed, update the entire command */ + int len = strnchr(task_entry->comm, ' ', TASK_COMM_LEN) + - task_entry->comm; + + if (strncmp(task_entry->comm, task->comm, len)) + get_full_task_comm(task_entry, task); + return task_entry; + } + } + return NULL; +} + +static struct task_entry *find_or_register_task(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct task_entry *task_entry; + pid_t pid = task->pid; + + task_entry = find_task_entry(uid_entry, task); + if (task_entry) + return task_entry; + + task_entry = kzalloc(sizeof(struct task_entry), GFP_ATOMIC); + if (!task_entry) + return NULL; + + get_full_task_comm(task_entry, task); + + task_entry->pid = pid; + hash_add(uid_entry->task_entries, &task_entry->hash, (unsigned int)pid); + + return task_entry; +} + +static void remove_uid_tasks(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + struct hlist_node *tmp_task; + + hash_for_each_safe(uid_entry->task_entries, bkt_task, + tmp_task, task_entry, hash) { + hash_del(&task_entry->hash); + kfree(task_entry); + } +} + +static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + memset(&task_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + } +} + +static void add_uid_tasks_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) +{ + struct task_entry *task_entry = find_or_register_task(uid_entry, task); + 
struct io_stats *task_io_slot = &task_entry->io[slot]; + + task_io_slot->read_bytes += task->ioac.read_bytes; + task_io_slot->write_bytes += compute_write_bytes(task); + task_io_slot->rchar += task->ioac.rchar; + task_io_slot->wchar += task->ioac.wchar; + task_io_slot->fsync += task->ioac.syscfs; +} + +static void compute_io_uid_tasks(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + compute_io_bucket_stats(&task_entry->io[uid_entry->state], + &task_entry->io[UID_STATE_TOTAL_CURR], + &task_entry->io[UID_STATE_TOTAL_LAST], + &task_entry->io[UID_STATE_DEAD_TASKS]); + } +} + +static void show_io_uid_tasks(struct seq_file *m, struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + /* Separated by comma because space exists in task comm */ + seq_printf(m, "task,%s,%lu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", + task_entry->comm, + (unsigned long)task_entry->pid, + task_entry->io[UID_STATE_FOREGROUND].rchar, + task_entry->io[UID_STATE_FOREGROUND].wchar, + task_entry->io[UID_STATE_FOREGROUND].read_bytes, + task_entry->io[UID_STATE_FOREGROUND].write_bytes, + task_entry->io[UID_STATE_BACKGROUND].rchar, + task_entry->io[UID_STATE_BACKGROUND].wchar, + task_entry->io[UID_STATE_BACKGROUND].read_bytes, + task_entry->io[UID_STATE_BACKGROUND].write_bytes, + task_entry->io[UID_STATE_FOREGROUND].fsync, + task_entry->io[UID_STATE_BACKGROUND].fsync); + } +} +#else +static void remove_uid_tasks(struct uid_entry *uid_entry) {}; +static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) {}; +static void add_uid_tasks_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) {}; +static void compute_io_uid_tasks(struct uid_entry *uid_entry) {}; +static void show_io_uid_tasks(struct seq_file *m, + struct uid_entry *uid_entry) {} +#endif + static struct uid_entry *find_uid_entry(uid_t uid) { struct uid_entry *uid_entry; @@ -86,7 +318,9 @@ static struct uid_entry *find_or_register_uid(uid_t uid) return NULL; uid_entry->uid = uid; - +#ifdef CONFIG_UID_SYS_STATS_DEBUG + hash_init(uid_entry->task_entries); +#endif hash_add(hash_table, &uid_entry->hash, uid); return uid_entry; @@ -192,6 +426,7 @@ static ssize_t uid_remove_write(struct file *file, hash_for_each_possible_safe(hash_table, uid_entry, tmp, hash, (uid_t)uid_start) { if (uid_start == uid_entry->uid) { + remove_uid_tasks(uid_entry); hash_del(&uid_entry->hash); kfree(uid_entry); } @@ -208,13 +443,6 @@ static const struct file_operations uid_remove_fops = { .write = uid_remove_write, }; -static u64 compute_write_bytes(struct task_struct *task) -{ - if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes) - return 0; - - return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; -} static void add_uid_io_stats(struct uid_entry *uid_entry, struct task_struct *task, int slot) @@ -226,28 +454,8 @@ static void add_uid_io_stats(struct uid_entry *uid_entry, io_slot->rchar += task->ioac.rchar; io_slot->wchar += task->ioac.wchar; io_slot->fsync += task->ioac.syscfs; -} -static void compute_uid_io_bucket_stats(struct io_stats *io_bucket, - struct io_stats *io_curr, - struct io_stats *io_last, - struct io_stats *io_dead) -{ - io_bucket->read_bytes += io_curr->read_bytes + io_dead->read_bytes - - io_last->read_bytes; - io_bucket->write_bytes += io_curr->write_bytes + io_dead->write_bytes - - 
io_last->write_bytes; - io_bucket->rchar += io_curr->rchar + io_dead->rchar - io_last->rchar; - io_bucket->wchar += io_curr->wchar + io_dead->wchar - io_last->wchar; - io_bucket->fsync += io_curr->fsync + io_dead->fsync - io_last->fsync; - - io_last->read_bytes = io_curr->read_bytes; - io_last->write_bytes = io_curr->write_bytes; - io_last->rchar = io_curr->rchar; - io_last->wchar = io_curr->wchar; - io_last->fsync = io_curr->fsync; - - memset(io_dead, 0, sizeof(struct io_stats)); + add_uid_tasks_io_stats(uid_entry, task, slot); } static void update_io_stats_all_locked(void) @@ -258,9 +466,11 @@ static void update_io_stats_all_locked(void) unsigned long bkt; uid_t uid; - hash_for_each(hash_table, bkt, uid_entry, hash) + hash_for_each(hash_table, bkt, uid_entry, hash) { memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); + set_io_uid_tasks_zero(uid_entry); + } rcu_read_lock(); do_each_thread(temp, task) { @@ -274,10 +484,11 @@ static void update_io_stats_all_locked(void) rcu_read_unlock(); hash_for_each(hash_table, bkt, uid_entry, hash) { - compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + compute_io_bucket_stats(&uid_entry->io[uid_entry->state], &uid_entry->io[UID_STATE_TOTAL_CURR], &uid_entry->io[UID_STATE_TOTAL_LAST], &uid_entry->io[UID_STATE_DEAD_TASKS]); + compute_io_uid_tasks(uid_entry); } } @@ -288,6 +499,7 @@ static void update_io_stats_uid_locked(struct uid_entry *uid_entry) memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); + set_io_uid_tasks_zero(uid_entry); rcu_read_lock(); do_each_thread(temp, task) { @@ -297,12 +509,14 @@ static void update_io_stats_uid_locked(struct uid_entry *uid_entry) } while_each_thread(temp, task); rcu_read_unlock(); - compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + compute_io_bucket_stats(&uid_entry->io[uid_entry->state], &uid_entry->io[UID_STATE_TOTAL_CURR], &uid_entry->io[UID_STATE_TOTAL_LAST], &uid_entry->io[UID_STATE_DEAD_TASKS]); + compute_io_uid_tasks(uid_entry); } + static int uid_io_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; @@ -314,21 +528,22 @@ static int uid_io_show(struct seq_file *m, void *v) hash_for_each(hash_table, bkt, uid_entry, hash) { seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", - uid_entry->uid, - uid_entry->io[UID_STATE_FOREGROUND].rchar, - uid_entry->io[UID_STATE_FOREGROUND].wchar, - uid_entry->io[UID_STATE_FOREGROUND].read_bytes, - uid_entry->io[UID_STATE_FOREGROUND].write_bytes, - uid_entry->io[UID_STATE_BACKGROUND].rchar, - uid_entry->io[UID_STATE_BACKGROUND].wchar, - uid_entry->io[UID_STATE_BACKGROUND].read_bytes, - uid_entry->io[UID_STATE_BACKGROUND].write_bytes, - uid_entry->io[UID_STATE_FOREGROUND].fsync, - uid_entry->io[UID_STATE_BACKGROUND].fsync); + uid_entry->uid, + uid_entry->io[UID_STATE_FOREGROUND].rchar, + uid_entry->io[UID_STATE_FOREGROUND].wchar, + uid_entry->io[UID_STATE_FOREGROUND].read_bytes, + uid_entry->io[UID_STATE_FOREGROUND].write_bytes, + uid_entry->io[UID_STATE_BACKGROUND].rchar, + uid_entry->io[UID_STATE_BACKGROUND].wchar, + uid_entry->io[UID_STATE_BACKGROUND].read_bytes, + uid_entry->io[UID_STATE_BACKGROUND].write_bytes, + uid_entry->io[UID_STATE_FOREGROUND].fsync, + uid_entry->io[UID_STATE_BACKGROUND].fsync); + + show_io_uid_tasks(m, uid_entry); } rt_mutex_unlock(&uid_lock); - return 0; } From ad0af9b183c73b40f0448b587e32bf42227b5d08 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 17 Aug 2017 18:52:04 +0530 Subject: [PATCH 1003/3220] ANDROID: uid_sys_stats: Fix implicit 
declaration of get_cmdline() Include linux/mm.h for the get_cmdline() declaration. Change-Id: Icad6ef7deef4d93d92d423c96bfa61fb5d66d0b7 Fixes: Change-Id: I30083b757eaef8c61e55a213a883ce8d0c9cf2b1 ("uid_sys_stats: log task io with a debug flag") Signed-off-by: Amit Pundir --- drivers/misc/uid_sys_stats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 2e4364b96b9a..031320e51522 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -19,6 +19,7 @@ #include #include #include +#include <linux/mm.h> #include #include #include From 1f3f566d9b41ceffb15e73da5e32c0e249acd535 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 14 Aug 2017 12:45:38 +0200 Subject: [PATCH 1004/3220] ANDROID: usb: gadget: configfs: fix null ptr in android_disconnect There's a race between usb_gadget_udc_stop(), which is likely to set the gadget driver to NULL in the udc driver, and this driver's gadget disconnect fn, which likely checks whether the gadget driver is a null ptr. It happens that unbind (doing set_gadget_data(NULL)) is called before the gadget driver is set to NULL, and the udc driver calls the disconnect fn, which results in cdev being a null ptr. As a workaround we check cdev in android_disconnect() to prevent the following panic: Unable to handle kernel NULL pointer dereference at virtual address 000000a8 pgd = ffffff800940a000 [000000a8] *pgd=00000000be1fe003, *pud=00000000be1fe003, *pmd=0000000000000000 Internal error: Oops: 96000046 [#1] PREEMPT SMP CPU: 7 PID: 1134 Comm: kworker/u16:3 Tainted: G S 4.9.41-g75cd2a0231ea-dirty #4 Hardware name: HiKey960 (DT) Workqueue: events_power_efficient event_work task: ffffffc0b5f4f000 task.stack: ffffffc0b5b94000 PC is at android_disconnect+0x54/0xa4 LR is at android_disconnect+0x54/0xa4 pc : [] lr : [] pstate: 80000185 sp : ffffffc0b5b97bf0 x29: ffffffc0b5b97bf0 x28: 0000000000000003 x27: ffffffc0b5181c54 x26: ffffffc0b5181c68 x25: ffffff8008dc1000 x24: ffffffc0b5181d70 x23: ffffff8008dc18a0 x22: ffffffc0b5f5a018 x21: ffffffc0b5894ad8 x20: 0000000000000000 x19: ffffff8008ddaec8 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 00000000007c9ccd x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000001 x10: 0000000000000001 x9 : ffffff800930f1a8 x8 : ffffff800932a133 x7 : 0000000000000000 x6 : 0000000000000000 x5 : ffffffc0b5b97a50 x4 : ffffffc0be19f090 x3 : 0000000000000000 x2 : ffffff80091ca000 x1 : 000000000000002f x0 : 000000000000002f This happened on a hikey960 with the following backtrace: [] android_disconnect+0x54/0xa4 [] dwc3_disconnect_gadget.part.19+0x114.888119] [] dwc3_gadget_suspend+0x6c/0x70 [] dwc3_suspend_device+0x58/0xa0 [] dwc3_otg_work+0x214/0x474 [] event_work+0x3bc/0x5ac [] process_one_work+0x14c/0x43c [] worker_thread+0x5c/0x438 [] kthread+0xec/0x100 [] ret_from_fork+0x10/0x50 dwc3_otg_work tries to handle a switch from host to device mode and therefore calls disconnect on the gadget driver. To reproduce the issue it is enough to enable tethering (rndis gadget), then unplug and plug the usb connector back in, which causes the change from device to host and back to device mode.
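The workaround amounts to a defensive NULL check before the first dereference. A standalone sketch of the pattern, with hypothetical types and fprintf standing in for WARN:

    #include <stdio.h>

    struct composite_dev { int dummy; };
    struct gadget { void *data; };

    static struct composite_dev *get_gadget_data(struct gadget *g)
    {
        return g->data;
    }

    static void disconnect(struct gadget *g)
    {
        struct composite_dev *cdev = get_gadget_data(g);

        /* unbind may already have run set_gadget_data(NULL) */
        if (cdev == NULL) {
            fprintf(stderr, "disconnect: gadget already unbound\n");
            return;  /* bail out instead of dereferencing */
        }
        /* ... normal disconnect handling using cdev ... */
    }

    int main(void)
    {
        struct gadget g = { .data = NULL };  /* simulate losing the race */
        disconnect(&g);                      /* must not crash */
        return 0;
    }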
Signed-off-by: Danilo Krummrich --- drivers/usb/gadget/configfs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 54849fe9cb01..a0bd3c542d06 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1524,6 +1524,18 @@ static void android_disconnect(struct usb_gadget *gadget) struct usb_composite_dev *cdev = get_gadget_data(gadget); struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev); + /* FIXME: There's a race between usb_gadget_udc_stop() which is likely + * to set the gadget driver to NULL in the udc driver and this drivers + * gadget disconnect fn which likely checks for the gadget driver to + * be a null ptr. It happens that unbind (doing set_gadget_data(NULL)) + * is called before the gadget driver is set to NULL and the udc driver + * calls disconnect fn which results in cdev being a null ptr. + */ + if (cdev == NULL) { + WARN(1, "%s: gadget driver already disconnected\n", __func__); + return; + } + /* accessory HID support can be active while the accessory function is not actually enabled, so we need to inform it when we are disconnected. From 967ca302456550859bfea3cda9971a3035b6889f Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 21 Jul 2017 14:58:16 -0700 Subject: [PATCH 1005/3220] ANDROID: usb: gadget: assign no-op request complete callbacks The req->complete field seems to persist the callback pointer for the control requests. This causes the accessory's serial string to be overridden when an accessory-function-specific OUT control request is issued right after the ACCESSORY_SEND_STRING control request. Therefore, assign a no-op req complete function when nothing needs to be done once the request is completed.
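The hazard being fixed is a function pointer that persists from one control request to the next. A standalone sketch with a hypothetical request structure:

    #include <stdio.h>

    struct request {
        void (*complete)(struct request *req);
    };

    static void store_string(struct request *req)
    {
        (void)req;
        puts("copying string into dev->serial");
    }

    static void noop_complete(struct request *req)
    {
        (void)req;  /* nothing to do for this request */
    }

    static void finish(struct request *req)
    {
        if (req->complete)
            req->complete(req);
    }

    int main(void)
    {
        struct request req = { 0 };

        req.complete = store_string;   /* e.g. ACCESSORY_SEND_STRING */
        finish(&req);

        /* Without resetting it, the next request would re-run
         * store_string and clobber the serial; the fix installs a
         * no-op for requests that need no completion work. */
        req.complete = noop_complete;  /* e.g. ACCESSORY_SET_AUDIO_MODE */
        finish(&req);
        return 0;
    }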
Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: I78c1602e9a044b8718b270b8a068cf5afc83f984 --- drivers/usb/gadget/function/f_accessory.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 76b8ae08a551..d31c0809046f 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -812,6 +812,14 @@ static struct hid_driver acc_hid_driver = { .probe = acc_hid_probe, }; +static void acc_complete_setup_noop(struct usb_ep *ep, struct usb_request *req) +{ + /* + * Default no-op function when nothing needs to be done for the + * setup request + */ +} + int acc_ctrlrequest(struct usb_composite_dev *cdev, const struct usb_ctrlrequest *ctrl) { @@ -839,6 +847,7 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, schedule_delayed_work( &dev->start_work, msecs_to_jiffies(10)); value = 0; + cdev->req->complete = acc_complete_setup_noop; } else if (b_request == ACCESSORY_SEND_STRING) { dev->string_index = w_index; cdev->gadget->ep0->driver_data = dev; @@ -847,10 +856,13 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, } else if (b_request == ACCESSORY_SET_AUDIO_MODE && w_index == 0 && w_length == 0) { dev->audio_mode = w_value; + cdev->req->complete = acc_complete_setup_noop; value = 0; } else if (b_request == ACCESSORY_REGISTER_HID) { + cdev->req->complete = acc_complete_setup_noop; value = acc_register_hid(dev, w_value, w_index); } else if (b_request == ACCESSORY_UNREGISTER_HID) { + cdev->req->complete = acc_complete_setup_noop; value = acc_unregister_hid(dev, w_value); } else if (b_request == ACCESSORY_SET_HID_REPORT_DESC) { spin_lock_irqsave(&dev->lock, flags); @@ -885,7 +897,7 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, if (b_request == ACCESSORY_GET_PROTOCOL) { *((u16 *)cdev->req->buf) = PROTOCOL_VERSION; value = sizeof(u16); - + cdev->req->complete = acc_complete_setup_noop; /* clear any string left over from a previous session */ memset(dev->manufacturer, 0, sizeof(dev->manufacturer)); memset(dev->model, 0, sizeof(dev->model)); From 9e6afd4e3684b5a6664349fc87c6b9fb48c375ae Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Wed, 2 Dec 2015 10:06:45 -0600 Subject: [PATCH 1006/3220] BACKPORT: usb: dwc3: gadget: handle request->zero So far, dwc3 has always missed request->zero handling for every endpoint. Let's implement that so we can handle cases where transfer must be finished with a ZLP. Note that dwc3 is a little special. Even though we're dealing with a ZLP, we still need a buffer of wMaxPacketSize bytes; to hide that detail from every gadget driver, we have a preallocated buffer of 1024 bytes (biggest bulk size) to use (and share) among all endpoints. 
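When an extra ZLP is needed reduces to a small predicate on the request; a userspace sketch (the names are illustrative, not the driver's):

    #include <assert.h>
    #include <stdbool.h>

    /* A transfer needs an explicit ZLP terminator only when the caller
     * asked for one (zero flag) and the length is an exact multiple of
     * the endpoint's wMaxPacketSize, i.e. no short packet ends it. */
    static bool needs_zlp(unsigned len, unsigned maxpacket, bool zero)
    {
        return zero && (len % maxpacket == 0);
    }

    int main(void)
    {
        assert( needs_zlp(1024, 512, true));  /* exact multiple: queue ZLP */
        assert(!needs_zlp(1000, 512, true));  /* short packet terminates */
        assert(!needs_zlp(1024, 512, false)); /* caller didn't request one */
        return 0;
    }

Note that the follow-up patch below tightens this further: a request whose length is already zero goes out as a ZLP by itself and must not get a second one.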
(cherry-picked from commit 04c03d10e507052cfce6910ddf34091196e79e1c) Reported-by: Ravi B Signed-off-by: Felipe Balbi Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: Ied4093e92dd080f2d9dd1fd9aefb36406f66bb67 --- drivers/usb/dwc3/core.h | 3 +++ drivers/usb/dwc3/gadget.c | 49 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 68d11d7d4028..d5b604a2c325 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -37,6 +37,7 @@ #define DWC3_MSG_MAX 500 /* Global constants */ +#define DWC3_ZLP_BUF_SIZE 1024 /* size of a superspeed bulk */ #define DWC3_EP0_BOUNCE_SIZE 512 #define DWC3_ENDPOINTS_NUM 32 #define DWC3_XHCI_RESOURCES_NUM 2 @@ -645,6 +646,7 @@ struct dwc3_scratchpad_array { * @ctrl_req: usb control request which is used for ep0 * @ep0_trb: trb which is used for the ctrl_req * @ep0_bounce: bounce buffer for ep0 + * @zlp_buf: used when request->zero is set * @setup_buf: used while precessing STD USB requests * @ctrl_req_addr: dma address of ctrl_req * @ep0_trb: dma address of ep0_trb @@ -732,6 +734,7 @@ struct dwc3 { struct usb_ctrlrequest *ctrl_req; struct dwc3_trb *ep0_trb; void *ep0_bounce; + void *zlp_buf; void *scratchbuf; u8 *setup_buf; dma_addr_t ctrl_req_addr; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index d3bd1afd6302..c0d21367d107 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1200,6 +1200,32 @@ out: return ret; } +static void __dwc3_gadget_ep_zlp_complete(struct usb_ep *ep, + struct usb_request *request) +{ + dwc3_gadget_ep_free_request(ep, request); +} + +static int __dwc3_gadget_ep_queue_zlp(struct dwc3 *dwc, struct dwc3_ep *dep) +{ + struct dwc3_request *req; + struct usb_request *request; + struct usb_ep *ep = &dep->endpoint; + + dwc3_trace(trace_dwc3_gadget, "queueing ZLP\n"); + request = dwc3_gadget_ep_alloc_request(ep, GFP_ATOMIC); + if (!request) + return -ENOMEM; + + request->length = 0; + request->buf = dwc->zlp_buf; + request->complete = __dwc3_gadget_ep_zlp_complete; + + req = to_dwc3_request(request); + + return __dwc3_gadget_ep_queue(dep, req); +} + static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, gfp_t gfp_flags) { @@ -1227,6 +1253,15 @@ static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, ret = __dwc3_gadget_ep_queue(dep, req); + /* + * Okay, here's the thing, if gadget driver has requested for a ZLP by + * setting request->zero, instead of doing magic, we will just queue an + * extra usb_request ourselves so that it gets handled the same way as + * any other request. 
+ */ + if (ret == 0 && request->zero && (request->length % ep->maxpacket == 0)) + ret = __dwc3_gadget_ep_queue_zlp(dwc, dep); + out: spin_unlock_irqrestore(&dwc->lock, flags); @@ -2798,6 +2833,12 @@ int dwc3_gadget_init(struct dwc3 *dwc) goto err3; } + dwc->zlp_buf = kzalloc(DWC3_ZLP_BUF_SIZE, GFP_KERNEL); + if (!dwc->zlp_buf) { + ret = -ENOMEM; + goto err4; + } + dwc->gadget.ops = &dwc3_gadget_ops; dwc->gadget.speed = USB_SPEED_UNKNOWN; dwc->gadget.sg_supported = true; @@ -2839,16 +2880,19 @@ int dwc3_gadget_init(struct dwc3 *dwc) ret = dwc3_gadget_init_endpoints(dwc); if (ret) - goto err4; + goto err5; ret = usb_add_gadget_udc(dwc->dev, &dwc->gadget); if (ret) { dev_err(dwc->dev, "failed to register udc\n"); - goto err4; + goto err5; } return 0; +err5: + kfree(dwc->zlp_buf); + err4: dwc3_gadget_free_endpoints(dwc); dma_free_coherent(dwc->dev, DWC3_EP0_BOUNCE_SIZE, @@ -2881,6 +2925,7 @@ void dwc3_gadget_exit(struct dwc3 *dwc) dwc->ep0_bounce, dwc->ep0_bounce_addr); kfree(dwc->setup_buf); + kfree(dwc->zlp_buf); dma_free_coherent(dwc->dev, sizeof(*dwc->ep0_trb) * 2, dwc->ep0_trb, dwc->ep0_trb_addr); From 8e7cf0b11e316ba11c7fc3b9bc3ba331f589cc98 Mon Sep 17 00:00:00 2001 From: John Youn Date: Tue, 22 Dec 2015 12:23:20 -0800 Subject: [PATCH 1007/3220] UPSTREAM: usb: dwc3: gadget: don't send extra ZLP If the request->length is zero, a ZLP should already be sent due to that and another ZLP is not needed to terminate the transfer. (cherry-picked from commit d9261898a4b2c143c28568dc686a1becfc637a99) Fixes: 04c03d10e507 ("usb: dwc3: gadget: handle request->zero") Signed-off-by: John Youn Signed-off-by: Felipe Balbi Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: I97a1801c57dc169b6e9371d5aec599a14f316aff --- drivers/usb/dwc3/gadget.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index c0d21367d107..3368c3a88a57 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1259,7 +1259,8 @@ static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, * extra usb_request ourselves so that it gets handled the same way as * any other request. */ - if (ret == 0 && request->zero && (request->length % ep->maxpacket == 0)) + if (ret == 0 && request->zero && request->length && + (request->length % ep->maxpacket == 0)) ret = __dwc3_gadget_ep_queue_zlp(dwc, dep); out: From eb4610b979435c98821eefd716b6117a7ca0478d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 10:43:14 -0700 Subject: [PATCH 1008/3220] ANDROID: NFC: st21nfca: Fix out of bounds kernel access when handling ATR_REQ Out of bounds kernel accesses in st21nfca's NFC HCI layer might happen when handling ATR_REQ events if user-specified atr_req->length is bigger than the buffer size. In that case memcpy() inside st21nfca_tm_send_atr_res() will read extra bytes resulting in OOB read from the kernel heap. 
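The essential fix is validating a packet-supplied length against both the minimum structure size and the data actually received, before the memcpy(). A minimal sketch with hypothetical sizes:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define ATR_REQ_MIN 16  /* stand-in for sizeof(struct ..._atr_req) */

    static int parse_atr_req(const unsigned char *data, size_t skb_len,
                             unsigned char *out, size_t out_cap)
    {
        size_t length = data[0];  /* attacker-controlled length byte */

        /* Reject both truncated and over-long claims before copying. */
        if (length < ATR_REQ_MIN || length > skb_len || length > out_cap)
            return -EPROTO;

        memcpy(out, data, length);
        return 0;
    }

    int main(void)
    {
        unsigned char pkt[8] = { 200 };  /* claims 200 bytes */
        unsigned char buf[64];

        /* Without the 'length > skb_len' check this would read 192
         * bytes past the end of the 8-byte packet. */
        printf("%d\n", parse_atr_req(pkt, sizeof(pkt), buf, sizeof(buf)));
        return 0;
    }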
Bug: 62679012 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/st21nfca/dep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c index 798a32bbac5d..206285210ab5 100644 --- a/drivers/nfc/st21nfca/dep.c +++ b/drivers/nfc/st21nfca/dep.c @@ -217,7 +217,8 @@ static int st21nfca_tm_recv_atr_req(struct nfc_hci_dev *hdev, atr_req = (struct st21nfca_atr_req *)skb->data; - if (atr_req->length < sizeof(struct st21nfca_atr_req)) { + if (atr_req->length < sizeof(struct st21nfca_atr_req) || + atr_req->length > skb->len) { r = -EPROTO; goto exit; } From 723c3e0fec28b9bdd15bbc968a14cb584fdd014b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 16:30:47 -0700 Subject: [PATCH 1009/3220] ANDROID: nfc: fdp: Fix possible buffer overflow in WCS4000 NFC driver Possible buffer overflow when reading next_read_size bytes into tmp buffer after next_read_size was extracted from a previous packet. Bug: 62678828 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/fdp/i2c.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c index a5d7332dfce5..2d043415f326 100644 --- a/drivers/nfc/fdp/i2c.c +++ b/drivers/nfc/fdp/i2c.c @@ -177,6 +177,16 @@ static int fdp_nci_i2c_read(struct fdp_i2c_phy *phy, struct sk_buff **skb) /* Packet that contains a length */ if (tmp[0] == 0 && tmp[1] == 0) { phy->next_read_size = (tmp[2] << 8) + tmp[3] + 3; + /* + * Ensure next_read_size does not exceed sizeof(tmp) + * for reading that many bytes during next iteration + */ + if (phy->next_read_size > FDP_NCI_I2C_MAX_PAYLOAD) { + dev_dbg(&client->dev, "%s: corrupted packet\n", + __func__); + phy->next_read_size = 5; + goto flush; + } } else { phy->next_read_size = FDP_NCI_I2C_MIN_PAYLOAD; From 3f4427d43b3b684ec867be2ca2ae3287ba6dde46 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 15:50:54 -0700 Subject: [PATCH 1010/3220] ANDROID: NFC: Fix possible memory corruption when handling SHDLC I-Frame commands When handling SHDLC I-Frame commands "pipe" field used for indexing into an array should be checked before usage. If left unchecked it might access memory outside of the array of size NFC_HCI_MAX_PIPES(127). 
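The fix is the classic validate-before-index pattern; a standalone sketch assuming the 127-entry pipe table described above:

    #include <stdio.h>

    #define MAX_PIPES 127

    struct pipe_entry { int gate; int dest_host; };
    static struct pipe_entry pipes[MAX_PIPES];

    /* Returns 0 on success, -1 (ANY_E_NOK in the driver) on a bad index. */
    static int delete_pipe(unsigned int pipe_id)
    {
        if (pipe_id >= MAX_PIPES)  /* reject before the array access */
            return -1;

        pipes[pipe_id].gate = -1;
        pipes[pipe_id].dest_host = -1;
        return 0;
    }

    int main(void)
    {
        printf("%d\n", delete_pipe(3));    /* 0: valid index */
        printf("%d\n", delete_pipe(200));  /* -1: would be out of bounds */
        return 0;
    }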
Bug: 62679701 Signed-off-by: Suren Baghdasaryan --- net/nfc/hci/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 2b0f0ac498d2..5a58f9f38095 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } create_info = (struct hci_create_pipe_resp *)skb->data; + if (create_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we @@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } delete_info = (struct hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE; hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST; break; From 14a42657da62150b5ef6358d5cbb63250fd12c76 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 15 Aug 2017 15:12:24 -0700 Subject: [PATCH 1011/3220] ANDROID: check dir value of xfrm_userpolicy_id Check user provided dir value to prevent out-of-bound access which may occur if dir is not less than XFRM_POLICY_MAX. (url: http://seclists.org/bugtraq/2017/Jul/30) Bug: 64257838 Signed-off-by: Suren Baghdasaryan Change-Id: I5bbdf95e14a61bdf5207977d9a5a4465bc848da0 --- net/xfrm/xfrm_user.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7a5a64e70b4d..4c696d4d5ce3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1691,6 +1691,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, struct sk_buff *skb; int err; + err = verify_policy_dir(dir); + if (err) + return ERR_PTR(err); + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -2216,6 +2220,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); + err = verify_policy_dir(pi->dir); + if (err) + return err; + if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2331,6 +2339,11 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, { struct net *net = &init_net; struct sk_buff *skb; + int err; + + err = verify_policy_dir(dir); + if (err) + return err; skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k), GFP_ATOMIC); if (skb == NULL) @@ -2985,6 +2998,11 @@ out_free_skb: static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { + int err; + + err = verify_policy_dir(dir); + if (err) + return err; switch (c->event) { case XFRM_MSG_NEWPOLICY: From efc949fedd30023a30ea34586b485554932c878f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 13:58:40 -0700 Subject: [PATCH 1012/3220] ANDROID: NFC: st21nfca: Fix memory OOB and leak issues in connectivity events handler Overflow on memcpy is possible in kernel driver for st21nfca's NFC HCI layer when handling connectivity events if aid_len or params_len are bigger than the buffer size. Memory leak is possible when parameter tag is invalid. 
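Both halves of the fix follow from careful TLV parsing: check every length field against the remaining buffer before copying, and release allocations on every early error path. A compressed sketch, not the driver's exact layout:

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    #define TAG_AID    0x81
    #define TAG_PARAMS 0x82

    static int parse_transaction(const unsigned char *d, size_t len)
    {
        unsigned char *buf;
        size_t aid_len, params_len;

        if (len < 4 || d[0] != TAG_AID)  /* need tag+len, twice */
            return -EPROTO;
        aid_len = d[1];
        if (len < aid_len + 4)           /* bounds-check before use */
            return -EPROTO;

        buf = malloc(len);
        if (!buf)
            return -ENOMEM;
        memcpy(buf, &d[2], aid_len);

        params_len = d[aid_len + 3];
        if (d[aid_len + 2] != TAG_PARAMS ||
            len < aid_len + params_len + 4) {
            free(buf);                   /* don't leak on the error path */
            return -EPROTO;
        }
        memcpy(buf + aid_len, &d[aid_len + 4], params_len);
        free(buf);
        return 0;
    }

    int main(void)
    {
        unsigned char ok[8] = { TAG_AID, 2, 0xA0, 0x00, TAG_PARAMS, 2, 1, 2 };
        return parse_transaction(ok, sizeof(ok)) ? 1 : 0;
    }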
Bug: 62679581 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/st21nfca/se.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index c79d99b24c96..2d4b6e910b87 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -321,23 +321,33 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, * AID 81 5 to 16 * PARAMETERS 82 0 to 255 */ - if (skb->len < NFC_MIN_AID_LENGTH + 2 && + if (skb->len < NFC_MIN_AID_LENGTH + 2 || skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) return -EPROTO; + /* + * Buffer should have enough space for at least + * two tag fields + two length fields + aid_len (skb->data[1]) + */ + if (skb->len < skb->data[1] + 4) + return -EPROTO; + transaction = (struct nfc_evt_transaction *)devm_kzalloc(dev, skb->len - 2, GFP_KERNEL); transaction->aid_len = skb->data[1]; memcpy(transaction->aid, &skb->data[2], transaction->aid_len); - - /* Check next byte is PARAMETERS tag (82) */ - if (skb->data[transaction->aid_len + 2] != - NFC_EVT_TRANSACTION_PARAMS_TAG) - return -EPROTO; - transaction->params_len = skb->data[transaction->aid_len + 3]; + + /* Check next byte is PARAMETERS tag (82) and the length field */ + if (skb->data[transaction->aid_len + 2] != + NFC_EVT_TRANSACTION_PARAMS_TAG || + skb->len < transaction->aid_len + transaction->params_len + 4) { + devm_kfree(dev, transaction); + return -EPROTO; + } + memcpy(transaction->params, skb->data + transaction->aid_len + 4, transaction->params_len); From 1ed33cf95476666656d510fab15d87ee04e2327d Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Aug 2017 09:56:27 -0700 Subject: [PATCH 1013/3220] UPSTREAM: cpufreq: schedutil: Make iowait boost more energy efficient Currently the iowait_boost feature in schedutil makes the frequency go to max on iowait wakeups. This feature was added to handle a case that Peter described where the throughput of operations involving continuous I/O requests [1] is reduced by running at a lower frequency; the lower throughput in turn keeps utilization low and hence frequency low, so it is "stuck". Instead of going to max, it's also possible to achieve the same effect by ramping up to max if repeated in_iowait wakeups are happening. This patch is an attempt to do that. We start from a lower frequency (policy->min) and double the boost for every consecutive iowait update until we reach the maximum iowait boost frequency (iowait_boost_max). I ran a synthetic test (continuous O_DIRECT writes in a loop) on an x86 machine with intel_pstate in passive mode using schedutil. In this test the iowait_boost value ramped from 800MHz to 4GHz in 60ms. The patch achieves the same improved throughput as the existing behavior. [1] https://patchwork.kernel.org/patch/9735885/ Change-Id: I4a018434a50f4ca29ec15b03465f6dc212e54423 Cc: Srinivas Pandruvada Cc: Len Brown Cc: Rafael J.
Wysocki Cc: Viresh Kumar Cc: Ingo Molnar Cc: Peter Zijlstra Suggested-by: Peter Zijlstra Suggested-by: Viresh Kumar Signed-off-by: Joel Fernandes --- kernel/sched/cpufreq_schedutil.c | 38 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e12309c1b07b..f6b95ca15023 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -64,6 +64,7 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + bool iowait_boost_pending; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -224,30 +225,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { if (flags & SCHED_CPUFREQ_IOWAIT) { - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + if (sg_cpu->iowait_boost_pending) + return; + + sg_cpu->iowait_boost_pending = true; + + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else { + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + } } else if (sg_cpu->iowait_boost) { s64 delta_ns = time - sg_cpu->last_update; /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) + if (delta_ns > TICK_NSEC) { sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } } } static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util = sg_cpu->iowait_boost; - unsigned long boost_max = sg_cpu->iowait_boost_max; + unsigned long boost_util, boost_max; - if (!boost_util) + if (!sg_cpu->iowait_boost) return; + if (sg_cpu->iowait_boost_pending) { + sg_cpu->iowait_boost_pending = false; + } else { + sg_cpu->iowait_boost >>= 1; + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + sg_cpu->iowait_boost = 0; + return; + } + } + + boost_util = sg_cpu->iowait_boost; + boost_max = sg_cpu->iowait_boost_max; + if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; } - sg_cpu->iowait_boost >>= 1; } #ifdef CONFIG_NO_HZ_COMMON @@ -320,6 +345,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) delta_ns = last_freq_update_time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; + j_sg_cpu->iowait_boost_pending = false; continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) From 7842de4545c71363ef599173b66ab5e1933c43e1 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 23 Jul 2017 08:54:26 -0700 Subject: [PATCH 1014/3220] UPSTREAM: cpufreq: schedutil: Use unsigned int for iowait boost Make iowait_boost and iowait_boost_max as unsigned int since its unit is kHz and this is consistent with struct cpufreq_policy. Also change the local variables in sugov_iowait_boost to match this. Change-Id: I6c67ed94c57c4bdb24bada4b97045593fcb95d2e Cc: Srinivas Pandruvada Cc: Len Brown Cc: Rafael J. 
Wysocki Cc: Viresh Kumar Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Joel Fernandes --- kernel/sched/cpufreq_schedutil.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f6b95ca15023..a0bcf488fb81 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -65,8 +65,8 @@ struct sugov_cpu { struct sugov_policy *sg_policy; bool iowait_boost_pending; - unsigned long iowait_boost; - unsigned long iowait_boost_max; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; /* The fields below are only needed when sharing a policy. */ @@ -251,7 +251,7 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util, boost_max; + unsigned int boost_util, boost_max; if (!sg_cpu->iowait_boost) return; From 9c4d6ba9981838ba4c703cf432ec52b6ca36aaa2 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 29 Aug 2017 10:36:40 +0530 Subject: [PATCH 1015/3220] android: android-base.config: enable IP6_NF_MATCH_RPFILTER USB tethering has a hard dependency on ip6t_rpfilter module now. So enable IP6_NF_MATCH_RPFILTER config to make it work again, otherwise we run into following failures when we try to enable USB tethering on Hikey: W IptablesRestoreController: iptables-restore process 1893 terminated status=256 E IptablesRestoreController: iptables error: E IptablesRestoreController: ------- COMMAND ------- E IptablesRestoreController: *raw E IptablesRestoreController: -A natctrl_raw_PREROUTING -i usb0 -m rpfilter --invert ! -s fe80::/64 -j DROP E IptablesRestoreController: COMMIT E IptablesRestoreController: E IptablesRestoreController: ------- ERROR ------- E IptablesRestoreController: ip6tables-restore: line 319 failed E IptablesRestoreController: ---------------------- E NatController: Error setting forward rules E Tethering: [usb0] ERROR Exception enabling NAT: java.lang.IllegalStateException: command '55 nat enable usb0 wlan0 1 192.168.42.0/24' failed with '400 55 Nat operation failed (No such device)' Change-Id: I4a4e192ee1c81a7a425eefee9a2a53dd41e1fa0e Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 48b2cdbe8d49..7199561cd42b 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -36,6 +36,7 @@ CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_IP6_NF_FILTER=y CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_MATCH_RPFILTER=y CONFIG_IP6_NF_RAW=y CONFIG_IP6_NF_TARGET_REJECT=y CONFIG_IPV6=y From 0e05bd2dc0e79d18a31bcfb2474c2ecaf7d54073 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 30 Jun 2017 10:22:23 -0700 Subject: [PATCH 1016/3220] FROMLIST: android: binder: Refactor prev and next buffer into a helper function (from https://patchwork.kernel.org/patch/9928605/) Use helper functions buffer_next and buffer_prev instead of list_entry to get the next and previous buffers. 
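The helpers simply name the container_of arithmetic that list_entry performs. The same pattern on a toy node, as a standalone sketch:

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    struct buffer {
        int id;
        struct list_head entry;
    };

    /* list_entry is container_of: back up from the embedded node
     * to the structure that contains it. */
    #define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct buffer *buffer_next(struct buffer *b)
    {
        return list_entry(b->entry.next, struct buffer, entry);
    }

    static struct buffer *buffer_prev(struct buffer *b)
    {
        return list_entry(b->entry.prev, struct buffer, entry);
    }

    int main(void)
    {
        struct buffer a = { .id = 1 }, b = { .id = 2 };

        /* Link the two nodes into a tiny circular list. */
        a.entry.next = &b.entry; a.entry.prev = &b.entry;
        b.entry.next = &a.entry; b.entry.prev = &a.entry;

        printf("%d %d\n", buffer_next(&a)->id, buffer_prev(&b)->id);
        return 0;
    }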
Bug: 36007193 Change-Id: I422dce84afde3d2138a6d976593b109a9cc49003 Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index aabfebac6e57..134c649e5a24 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -48,14 +48,23 @@ module_param_named(debug_mask, binder_alloc_debug_mask, pr_info(x); \ } while (0) +static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer) +{ + return list_entry(buffer->entry.next, struct binder_buffer, entry); +} + +static struct binder_buffer *binder_buffer_prev(struct binder_buffer *buffer) +{ + return list_entry(buffer->entry.prev, struct binder_buffer, entry); +} + static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, struct binder_buffer *buffer) { if (list_is_last(&buffer->entry, &alloc->buffers)) return alloc->buffer + alloc->buffer_size - (void *)buffer->data; - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; + return (size_t)binder_buffer_next(buffer) - (size_t)buffer->data; } static void binder_insert_free_buffer(struct binder_alloc *alloc, @@ -470,7 +479,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, int free_page_start = 1; BUG_ON(alloc->buffers.next == &buffer->entry); - prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); + prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); if (buffer_end_page(prev) == buffer_start_page(buffer)) { free_page_start = 0; @@ -482,8 +491,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, } if (!list_is_last(&buffer->entry, &alloc->buffers)) { - next = list_entry(buffer->entry.next, - struct binder_buffer, entry); + next = binder_buffer_next(buffer); if (buffer_start_page(next) == buffer_end_page(buffer)) { free_page_end = 0; if (buffer_start_page(next) == @@ -544,8 +552,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; if (!list_is_last(&buffer->entry, &alloc->buffers)) { - struct binder_buffer *next = list_entry(buffer->entry.next, - struct binder_buffer, entry); + struct binder_buffer *next = binder_buffer_next(buffer); if (next->free) { rb_erase(&next->rb_node, &alloc->free_buffers); @@ -553,8 +560,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, } } if (alloc->buffers.next != &buffer->entry) { - struct binder_buffer *prev = list_entry(buffer->entry.prev, - struct binder_buffer, entry); + struct binder_buffer *prev = binder_buffer_prev(buffer); if (prev->free) { binder_delete_free_buffer(alloc, buffer); From 3de14ff34cd4a21a7890e3d41f3532ab97424ae3 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 22 Jun 2017 14:37:45 -0700 Subject: [PATCH 1017/3220] FROMLIST: android: binder: Add allocator selftest (from https://patchwork.kernel.org/patch/9928609/) binder_alloc_selftest tests that alloc_new_buf handles page allocation and deallocation properly when allocating and freeing buffers. The test allocates 5 buffers of various sizes to cover all possible page alignment cases, and frees the buffers in every order from an exhaustive list of freeing orders. Test: boot the device with ANDROID_BINDER_IPC_SELFTEST config option enabled. Allocator selftest passes.
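The exhaustive-freeing-order part of the test amounts to enumerating every permutation of the buffer indices and freeing in each order. A sketch of that enumeration for a smaller N (not the driver's code):

    #include <stdio.h>

    #define N 3  /* the selftest uses 5 buffers; 3 keeps the output short */

    static void try_order(const int *order)
    {
        /* In the real test: allocate N buffers, free them in this
         * order, then verify the allocator's page accounting. */
        for (int i = 0; i < N; i++)
            printf("%d ", order[i]);
        printf("\n");
    }

    static void permute(int *order, int k)
    {
        if (k == N) {
            try_order(order);
            return;
        }
        for (int i = k; i < N; i++) {
            int t = order[k]; order[k] = order[i]; order[i] = t;
            permute(order, k + 1);
            t = order[k]; order[k] = order[i]; order[i] = t;  /* undo swap */
        }
    }

    int main(void)
    {
        int order[N] = { 0, 1, 2 };

        permute(order, 0);  /* prints all N! = 6 free orders */
        return 0;
    }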
Bug: 36007193 Change-Id: I2fe396232b7dfe4bbc50bdba99ca0de9be63cc37 Signed-off-by: Sherry Yang --- drivers/android/Kconfig | 10 + drivers/android/Makefile | 1 + drivers/android/binder.c | 2 + drivers/android/binder_alloc.h | 5 + drivers/android/binder_alloc_selftest.c | 271 ++++++++++++++++++++++++ 5 files changed, 289 insertions(+) create mode 100644 drivers/android/binder_alloc_selftest.c diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 4d4cdc1a6e25..01de42c8b74b 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -44,6 +44,16 @@ config ANDROID_BINDER_IPC_32BIT Note that enabling this will break newer Android user-space. +config ANDROID_BINDER_IPC_SELFTEST + bool "Android Binder IPC Driver Selftest" + depends on ANDROID_BINDER_IPC + ---help--- + This feature allows binder selftest to run. + + Binder selftest checks the allocation and free of binder buffers + exhaustively with combinations of various buffer sizes and + alignments. + endif # if ANDROID endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index 4b7c726bb560..a01254c43ee3 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -1,3 +1,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o +obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o diff --git a/drivers/android/binder.c b/drivers/android/binder.c index bfdd52ea0d1c..75cbbd2e986a 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4580,6 +4580,8 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/ + binder_selftest_alloc(&proc->alloc); + trace_binder_ioctl(cmd, arg); ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 088e4ffc6230..4f02cc084c15 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -102,6 +102,11 @@ struct binder_alloc { int pid; }; +#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST +void binder_selftest_alloc(struct binder_alloc *alloc); +#else +static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} +#endif extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, size_t offsets_size, diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c new file mode 100644 index 000000000000..cc00ab6ee29d --- /dev/null +++ b/drivers/android/binder_alloc_selftest.c @@ -0,0 +1,271 @@ +/* binder_alloc_selftest.c + * + * Android IPC Subsystem + * + * Copyright (C) 2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include "binder_alloc.h" + +#define BUFFER_NUM 5 +#define BUFFER_MIN_SIZE (PAGE_SIZE / 8) + +static bool binder_selftest_run = true; +static int binder_selftest_failures; +static DEFINE_MUTEX(binder_selftest_lock); + +/** + * enum buf_end_align_type - Page alignment of a buffer + * end with regard to the end of the previous buffer. + * + * In the pictures below, buf2 refers to the buffer we + * are aligning. buf1 refers to previous buffer by addr. + * Symbol [ means the start of a buffer, ] means the end + * of a buffer, and | means page boundaries. + */ +enum buf_end_align_type { + /** + * @SAME_PAGE_UNALIGNED: The end of this buffer is on + * the same page as the end of the previous buffer and + * is not page aligned. Examples: + * buf1 ][ buf2 ][ ... + * buf1 ]|[ buf2 ][ ... + */ + SAME_PAGE_UNALIGNED = 0, + /** + * @SAME_PAGE_ALIGNED: When the end of the previous buffer + * is not page aligned, the end of this buffer is on the + * same page as the end of the previous buffer and is page + * aligned. When the previous buffer is page aligned, the + * end of this buffer is aligned to the next page boundary. + * Examples: + * buf1 ][ buf2 ]| ... + * buf1 ]|[ buf2 ]| ... + */ + SAME_PAGE_ALIGNED, + /** + * @NEXT_PAGE_UNALIGNED: The end of this buffer is on + * the page next to the end of the previous buffer and + * is not page aligned. Examples: + * buf1 ][ buf2 | buf2 ][ ... + * buf1 ]|[ buf2 | buf2 ][ ... + */ + NEXT_PAGE_UNALIGNED, + /** + * @NEXT_PAGE_ALIGNED: The end of this buffer is on + * the page next to the end of the previous buffer and + * is page aligned. Examples: + * buf1 ][ buf2 | buf2 ]| ... + * buf1 ]|[ buf2 | buf2 ]| ... + */ + NEXT_PAGE_ALIGNED, + /** + * @NEXT_NEXT_UNALIGNED: The end of this buffer is on + * the page that follows the page after the end of the + * previous buffer and is not page aligned. Examples: + * buf1 ][ buf2 | buf2 | buf2 ][ ... + * buf1 ]|[ buf2 | buf2 | buf2 ][ ... 
+ */ + NEXT_NEXT_UNALIGNED, + LOOP_END, +}; + +static void pr_err_size_seq(size_t *sizes, int *seq) +{ + int i; + + pr_err("alloc sizes: "); + for (i = 0; i < BUFFER_NUM; i++) + pr_cont("[%zu]", sizes[i]); + pr_cont("\n"); + pr_err("free seq: "); + for (i = 0; i < BUFFER_NUM; i++) + pr_cont("[%d]", seq[i]); + pr_cont("\n"); +} + +static bool check_buffer_pages_allocated(struct binder_alloc *alloc, + struct binder_buffer *buffer, + size_t size) +{ + void *page_addr, *end; + int page_index; + + end = (void *)PAGE_ALIGN((uintptr_t)buffer + size); + for (page_addr = buffer; page_addr < end; page_addr += PAGE_SIZE) { + page_index = (page_addr - alloc->buffer) / PAGE_SIZE; + if (!alloc->pages[page_index]) { + pr_err("incorrect alloc state at page index %d\n", + page_index); + return false; + } + } + return true; +} + +static void binder_selftest_alloc_buf(struct binder_alloc *alloc, + struct binder_buffer *buffers[], + size_t *sizes, int *seq) +{ + int i; + + for (i = 0; i < BUFFER_NUM; i++) { + buffers[i] = binder_alloc_new_buf(alloc, sizes[i], 0, 0, 0); + if (IS_ERR(buffers[i]) || + !check_buffer_pages_allocated(alloc, buffers[i], + sizes[i])) { + pr_err_size_seq(sizes, seq); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffers[], + size_t *sizes, int *seq) +{ + int i; + + for (i = 0; i < BUFFER_NUM; i++) + binder_alloc_free_buf(alloc, buffers[seq[i]]); + + for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) { + if ((!alloc->pages[i]) == (i == 0)) { + pr_err("incorrect free state at page index %d\n", i); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_alloc_free(struct binder_alloc *alloc, + size_t *sizes, int *seq) +{ + struct binder_buffer *buffers[BUFFER_NUM]; + + binder_selftest_alloc_buf(alloc, buffers, sizes, seq); + binder_selftest_free_buf(alloc, buffers, sizes, seq); +} + +static bool is_dup(int *seq, int index, int val) +{ + int i; + + for (i = 0; i < index; i++) { + if (seq[i] == val) + return true; + } + return false; +} + +/* Generate BUFFER_NUM factorial free orders. */ +static void binder_selftest_free_seq(struct binder_alloc *alloc, + size_t *sizes, int *seq, int index) +{ + int i; + + if (index == BUFFER_NUM) { + binder_selftest_alloc_free(alloc, sizes, seq); + return; + } + for (i = 0; i < BUFFER_NUM; i++) { + if (is_dup(seq, index, i)) + continue; + seq[index] = i; + binder_selftest_free_seq(alloc, sizes, seq, index + 1); + } +} + +static void binder_selftest_alloc_size(struct binder_alloc *alloc, + size_t *end_offset) +{ + int i; + int seq[BUFFER_NUM] = {0}; + size_t front_sizes[BUFFER_NUM]; + size_t back_sizes[BUFFER_NUM]; + size_t last_offset, offset = 0; + + for (i = 0; i < BUFFER_NUM; i++) { + last_offset = offset; + offset = end_offset[i]; + front_sizes[i] = offset - last_offset; + back_sizes[BUFFER_NUM - i - 1] = front_sizes[i]; + } + /* + * Buffers share the first or last few pages. + * Only BUFFER_NUM - 1 buffer sizes are adjustable since + * we need one giant buffer before getting to the last page. 
+ */ + back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1] + - sizeof(struct binder_buffer) * BUFFER_NUM; + binder_selftest_free_seq(alloc, front_sizes, seq, 0); + binder_selftest_free_seq(alloc, back_sizes, seq, 0); +} + +static void binder_selftest_alloc_offset(struct binder_alloc *alloc, + size_t *end_offset, int index) +{ + int align; + size_t end, prev; + + if (index == BUFFER_NUM) { + binder_selftest_alloc_size(alloc, end_offset); + return; + } + prev = index == 0 ? 0 : end_offset[index - 1]; + end = prev; + + BUILD_BUG_ON((BUFFER_MIN_SIZE + sizeof(struct binder_buffer)) + * BUFFER_NUM >= PAGE_SIZE); + + for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) { + if (align % 2) + end = ALIGN(end, PAGE_SIZE); + else + end += BUFFER_MIN_SIZE; + end_offset[index] = end; + binder_selftest_alloc_offset(alloc, end_offset, index + 1); + } +} + +/** + * binder_selftest_alloc() - Test alloc and free of buffer pages. + * @alloc: Pointer to alloc struct. + * + * Allocate BUFFER_NUM buffers to cover all page alignment cases, + * then free them in all orders possible. Check that pages are + * allocated after buffer alloc and freed after freeing buffer. + */ +void binder_selftest_alloc(struct binder_alloc *alloc) +{ + size_t end_offset[BUFFER_NUM]; + + if (!binder_selftest_run) + return; + mutex_lock(&binder_selftest_lock); + if (!binder_selftest_run || !alloc->vma) + goto done; + pr_info("STARTED\n"); + binder_selftest_alloc_offset(alloc, end_offset, 0); + binder_selftest_run = false; + if (binder_selftest_failures > 0) + pr_info("%d tests FAILED\n", binder_selftest_failures); + else + pr_info("PASSED\n"); + +done: + mutex_unlock(&binder_selftest_lock); +} From 7a6d4b157e1c046f4c4de9f2f1f1f7abfa6abb21 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 3 Aug 2017 11:33:53 -0700 Subject: [PATCH 1018/3220] FROMLIST: android: binder: Move buffer out of area shared with user space (from https://patchwork.kernel.org/patch/9928607/) The binder driver allocates buffer metadata in a region that is mapped in user space, and this metadata contains kernel pointers. This patch allocates the buffer metadata on the kernel heap, which is not mapped in user space, and uses a pointer to refer to the mapped data. Also move the alloc->buffers initialization from mmap to init, since the list is now used even when mmap failed or was not called.
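A schematic of the layout change, as a sketch only (field lists are abbreviated, the _old/_new struct names are invented for illustration, and the kernel's <linux/list.h> and <linux/rbtree.h> types are assumed):

	#include <linux/list.h>
	#include <linux/rbtree.h>
	#include <linux/types.h>

	/* Before: metadata inlined at the head of the user-mappable buffer. */
	struct binder_buffer_old {
		struct list_head entry;		/* kernel list pointers ... */
		struct rb_node rb_node;		/* ... visible in the mapped pages */
		size_t data_size;
		uint8_t data[0];		/* payload follows in the same pages */
	};

	/* After: metadata kzalloc'd on the kernel heap, payload by pointer. */
	struct binder_buffer_new {
		struct list_head entry;
		struct rb_node rb_node;
		size_t data_size;
		void *data;			/* points into the mmap'd area */
	};

With the old layout, kernel pointer values were exposed to user space through the mapping; with the new one, user space only ever sees payload bytes.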
Bug: 36007193 Change-Id: Id5136048bdb7b796f59de066de7ea7df410498f5 Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 146 ++++++++++++++---------- drivers/android/binder_alloc.h | 2 +- drivers/android/binder_alloc_selftest.c | 11 +- 3 files changed, 91 insertions(+), 68 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 134c649e5a24..2ca2b02f82bb 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -62,9 +62,9 @@ static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, struct binder_buffer *buffer) { if (list_is_last(&buffer->entry, &alloc->buffers)) - return alloc->buffer + - alloc->buffer_size - (void *)buffer->data; - return (size_t)binder_buffer_next(buffer) - (size_t)buffer->data; + return (u8 *)alloc->buffer + + alloc->buffer_size - (u8 *)buffer->data; + return (u8 *)binder_buffer_next(buffer)->data - (u8 *)buffer->data; } static void binder_insert_free_buffer(struct binder_alloc *alloc, @@ -114,9 +114,9 @@ static void binder_insert_allocated_buffer_locked( buffer = rb_entry(parent, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (new_buffer < buffer) + if (new_buffer->data < buffer->data) p = &parent->rb_left; - else if (new_buffer > buffer) + else if (new_buffer->data > buffer->data) p = &parent->rb_right; else BUG(); @@ -131,18 +131,17 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( { struct rb_node *n = alloc->allocated_buffers.rb_node; struct binder_buffer *buffer; - struct binder_buffer *kern_ptr; + void *kern_ptr; - kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset - - offsetof(struct binder_buffer, data)); + kern_ptr = (void *)(user_ptr - alloc->user_buffer_offset); while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (kern_ptr < buffer) + if (kern_ptr < buffer->data) n = n->rb_left; - else if (kern_ptr > buffer) + else if (kern_ptr > buffer->data) n = n->rb_right; else { /* @@ -330,6 +329,9 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, return ERR_PTR(-ENOSPC); } + /* Pad 0-size buffers so they get assigned unique addresses */ + size = max(size, sizeof(void *)); + while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(!buffer->free); @@ -389,14 +391,9 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, has_page_addr = (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); - if (n == NULL) { - if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) - buffer_size = size; /* no room for other buffers */ - else - buffer_size = size + sizeof(struct binder_buffer); - } + WARN_ON(n && buffer_size != size); end_page_addr = - (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); + (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, @@ -404,17 +401,25 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (ret) return ERR_PTR(ret); - rb_erase(best_fit, &alloc->free_buffers); - buffer->free = 0; - buffer->free_in_progress = 0; - binder_insert_allocated_buffer_locked(alloc, buffer); if (buffer_size != size) { - struct binder_buffer *new_buffer = (void *)buffer->data + size; + struct binder_buffer *new_buffer; + new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!new_buffer) { + pr_err("%s: %d failed to alloc new buffer struct\n", + __func__, alloc->pid); + goto 
err_alloc_buf_struct_failed; + } + new_buffer->data = (u8 *)buffer->data + size; list_add(&new_buffer->entry, &buffer->entry); new_buffer->free = 1; binder_insert_free_buffer(alloc, new_buffer); } + + rb_erase(best_fit, &alloc->free_buffers); + buffer->free = 0; + buffer->free_in_progress = 0; + binder_insert_allocated_buffer_locked(alloc, buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd got %pK\n", alloc->pid, size, buffer); @@ -429,6 +434,12 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, alloc->pid, size, alloc->free_async_space); } return buffer; + +err_alloc_buf_struct_failed: + binder_update_page_range(alloc, 0, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), + end_page_addr, NULL); + return ERR_PTR(-ENOMEM); } /** @@ -463,56 +474,59 @@ struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, static void *buffer_start_page(struct binder_buffer *buffer) { - return (void *)((uintptr_t)buffer & PAGE_MASK); + return (void *)((uintptr_t)buffer->data & PAGE_MASK); } -static void *buffer_end_page(struct binder_buffer *buffer) +static void *prev_buffer_end_page(struct binder_buffer *buffer) { - return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); + return (void *)(((uintptr_t)(buffer->data) - 1) & PAGE_MASK); } static void binder_delete_free_buffer(struct binder_alloc *alloc, struct binder_buffer *buffer) { struct binder_buffer *prev, *next = NULL; - int free_page_end = 1; - int free_page_start = 1; - + bool to_free = true; BUG_ON(alloc->buffers.next == &buffer->entry); prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); - if (buffer_end_page(prev) == buffer_start_page(buffer)) { - free_page_start = 0; - if (buffer_end_page(prev) == buffer_end_page(buffer)) - free_page_end = 0; + if (prev_buffer_end_page(prev) == buffer_start_page(buffer)) { + to_free = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer->data, prev->data); } if (!list_is_last(&buffer->entry, &alloc->buffers)) { next = binder_buffer_next(buffer); - if (buffer_start_page(next) == buffer_end_page(buffer)) { - free_page_end = 0; - if (buffer_start_page(next) == - buffer_start_page(buffer)) - free_page_start = 0; + if (buffer_start_page(next) == buffer_start_page(buffer)) { + to_free = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, + buffer->data, + next->data); } } - list_del(&buffer->entry); - if (free_page_start || free_page_end) { + + if (PAGE_ALIGNED(buffer->data)) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", - alloc->pid, buffer, free_page_start ? "" : " end", - free_page_end ? "" : " start", prev, next); - binder_update_page_range(alloc, 0, free_page_start ? - buffer_start_page(buffer) : buffer_end_page(buffer), - (free_page_end ? 
buffer_end_page(buffer) : - buffer_start_page(buffer)) + PAGE_SIZE, NULL); + "%d: merge free, buffer start %pK is page aligned\n", + alloc->pid, buffer->data); + to_free = false; } + + if (to_free) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK do not share page with %pK or %pK\n", + alloc->pid, buffer->data, + prev->data, next->data); + binder_update_page_range(alloc, 0, buffer_start_page(buffer), + buffer_start_page(buffer) + PAGE_SIZE, + NULL); + } + list_del(&buffer->entry); + kfree(buffer); } static void binder_free_buf_locked(struct binder_alloc *alloc, @@ -533,8 +547,8 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, BUG_ON(buffer->free); BUG_ON(size > buffer_size); BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < alloc->buffer); - BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); + BUG_ON(buffer->data < alloc->buffer); + BUG_ON(buffer->data > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { alloc->free_async_space += size + sizeof(struct binder_buffer); @@ -646,14 +660,14 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, } alloc->buffer_size = vma->vm_end - vma->vm_start; - if (binder_update_page_range(alloc, 1, alloc->buffer, - alloc->buffer + PAGE_SIZE, vma)) { + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!buffer) { ret = -ENOMEM; - failure_string = "alloc small buf"; - goto err_alloc_small_buf_failed; + failure_string = "alloc buffer struct"; + goto err_alloc_buf_struct_failed; } - buffer = alloc->buffer; - INIT_LIST_HEAD(&alloc->buffers); + + buffer->data = alloc->buffer; list_add(&buffer->entry, &alloc->buffers); buffer->free = 1; binder_insert_free_buffer(alloc, buffer); @@ -664,7 +678,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, return 0; -err_alloc_small_buf_failed: +err_alloc_buf_struct_failed: kfree(alloc->pages); alloc->pages = NULL; err_alloc_pages_failed: @@ -684,14 +698,13 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) { struct rb_node *n; int buffers, page_count; + struct binder_buffer *buffer; BUG_ON(alloc->vma); buffers = 0; mutex_lock(&alloc->mutex); while ((n = rb_first(&alloc->allocated_buffers))) { - struct binder_buffer *buffer; - buffer = rb_entry(n, struct binder_buffer, rb_node); /* Transaction should already have been freed */ @@ -701,6 +714,16 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) buffers++; } + while (!list_empty(&alloc->buffers)) { + buffer = list_first_entry(&alloc->buffers, + struct binder_buffer, entry); + WARN_ON(!buffer->free); + + list_del(&buffer->entry); + WARN_ON_ONCE(!list_empty(&alloc->buffers)); + kfree(buffer); + } + page_count = 0; if (alloc->pages) { int i; @@ -804,5 +827,6 @@ void binder_alloc_init(struct binder_alloc *alloc) alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(&alloc->mutex); + INIT_LIST_HEAD(&alloc->buffers); } diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 4f02cc084c15..dd5649bf6469 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -57,7 +57,7 @@ struct binder_buffer { size_t data_size; size_t offsets_size; size_t extra_buffers_size; - uint8_t data[0]; + void *data; }; /** diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index cc00ab6ee29d..0bf72079a9da 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -105,8 +105,9 @@ static bool 
check_buffer_pages_allocated(struct binder_alloc *alloc, void *page_addr, *end; int page_index; - end = (void *)PAGE_ALIGN((uintptr_t)buffer + size); - for (page_addr = buffer; page_addr < end; page_addr += PAGE_SIZE) { + end = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); + page_addr = buffer->data; + for (; page_addr < end; page_addr += PAGE_SIZE) { page_index = (page_addr - alloc->buffer) / PAGE_SIZE; if (!alloc->pages[page_index]) { pr_err("incorrect alloc state at page index %d\n", @@ -209,8 +210,7 @@ static void binder_selftest_alloc_size(struct binder_alloc *alloc, * Only BUFFER_NUM - 1 buffer sizes are adjustable since * we need one giant buffer before getting to the last page. */ - back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1] - - sizeof(struct binder_buffer) * BUFFER_NUM; + back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1]; binder_selftest_free_seq(alloc, front_sizes, seq, 0); binder_selftest_free_seq(alloc, back_sizes, seq, 0); } @@ -228,8 +228,7 @@ static void binder_selftest_alloc_offset(struct binder_alloc *alloc, prev = index == 0 ? 0 : end_offset[index - 1]; end = prev; - BUILD_BUG_ON((BUFFER_MIN_SIZE + sizeof(struct binder_buffer)) - * BUFFER_NUM >= PAGE_SIZE); + BUILD_BUG_ON(BUFFER_MIN_SIZE * BUFFER_NUM >= PAGE_SIZE); for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) { if (align % 2) From f73e8e762516ed18aa84084c1c2e7ddde3c2213a Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Sat, 29 Jul 2017 13:24:11 -0700 Subject: [PATCH 1019/3220] FROMLIST: android: binder: Add global lru shrinker to binder (from https://patchwork.kernel.org/patch/9928615/) Hold on to the pages allocated and mapped for transaction buffers until the system is under memory pressure. When that happens, use the Linux shrinker to free pages. Without the shrinker, the patch "android: binder: Move buffer out of area shared with user space" would cause a significant slowdown for small transactions that fit into the first page, because the free-list buffer header used to be inlined with the buffer data. In addition to preventing the performance regression for small transactions, this patch improves the performance of transactions that take up more than one page. Modify the alloc selftest to work with the shrinker change. Test: Run memory intensive applications (Chrome and Camera) to trigger shrinker callbacks. Binder frees memory as expected. Test: Run binderThroughputTest with high memory pressure option enabled.
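The wiring follows the kernel's standard two-callback shrinker pattern paired with a list_lru. Below is a distilled sketch under the assumption of this kernel era's APIs, with hypothetical my_* names and the locking simplified away; the real isolate callback, binder_alloc_free_page(), appears in the diff that follows:

	#include <linux/list_lru.h>
	#include <linux/shrinker.h>

	static struct list_lru my_lru;	/* holds objects that can be reclaimed */

	/* Hypothetical isolate callback: free the object backing @item and
	 * drop it from the lru (the real driver also takes trylocks here). */
	static enum lru_status my_free_one(struct list_head *item,
					   struct list_lru_one *lru,
					   spinlock_t *lock, void *cb_arg)
	{
		list_lru_isolate(lru, item);
		return LRU_REMOVED;
	}

	/* Tell the VM how many objects could be reclaimed right now. */
	static unsigned long my_count(struct shrinker *s,
				      struct shrink_control *sc)
	{
		return list_lru_count(&my_lru);
	}

	/* Under memory pressure, free up to sc->nr_to_scan objects. */
	static unsigned long my_scan(struct shrinker *s,
				     struct shrink_control *sc)
	{
		return list_lru_walk(&my_lru, my_free_one, NULL, sc->nr_to_scan);
	}

	static struct shrinker my_shrinker = {
		.count_objects = my_count,
		.scan_objects = my_scan,
		.seeks = DEFAULT_SEEKS,
	};

	/* At init: list_lru_init(&my_lru); register_shrinker(&my_shrinker); */

register_shrinker() hooks the callbacks into the VM's reclaim path, so pages parked on the lru cost nothing until memory actually becomes tight.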
Bug: 63926541 Change-Id: I3abfc43b405e7e0a6228da37e0689a4b944f0e00 Signed-off-by: Sherry Yang --- drivers/android/binder.c | 2 + drivers/android/binder_alloc.c | 173 ++++++++++++++++++++---- drivers/android/binder_alloc.h | 23 +++- drivers/android/binder_alloc_selftest.c | 68 ++++++++-- 4 files changed, 226 insertions(+), 40 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 75cbbd2e986a..3ce96db8e5d4 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5623,6 +5623,8 @@ static int __init binder_init(void) struct binder_device *device; struct hlist_node *tmp; + binder_alloc_shrinker_init(); + atomic_set(&binder_transaction_log.cur, ~0U); atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_deferred_workqueue = create_singlethread_workqueue("binder"); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 2ca2b02f82bb..bcdaf684a658 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -27,9 +27,12 @@ #include #include #include +#include #include "binder_alloc.h" #include "binder_trace.h" +struct list_lru binder_alloc_lru; + static DEFINE_MUTEX(binder_alloc_mmap_lock); enum { @@ -188,8 +191,9 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, { void *page_addr; unsigned long user_page_addr; - struct page **page; - struct mm_struct *mm; + struct binder_lru_page *page; + struct mm_struct *mm = NULL; + bool need_mm = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: %s pages %pK-%pK\n", alloc->pid, @@ -200,9 +204,18 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, trace_binder_update_page_range(alloc, allocate, start, end); - if (vma) - mm = NULL; - else + if (allocate == 0) + goto free_range; + + for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + if (!page->page_ptr) { + need_mm = true; + break; + } + } + + if (!vma && need_mm) mm = get_task_mm(alloc->tsk); if (mm) { @@ -215,10 +228,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (allocate == 0) - goto free_range; - - if (vma == NULL) { + if (!vma && need_mm) { pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", alloc->pid); goto err_no_vma; @@ -226,18 +236,33 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; + bool on_lru; page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - BUG_ON(*page); - *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); - if (*page == NULL) { + if (page->page_ptr) { + on_lru = list_lru_del(&binder_alloc_lru, &page->lru); + WARN_ON(!on_lru); + continue; + } + + if (WARN_ON(!vma)) + goto err_page_ptr_cleared; + + page->page_ptr = alloc_page(GFP_KERNEL | + __GFP_HIGHMEM | + __GFP_ZERO); + if (!page->page_ptr) { pr_err("%d: binder_alloc_buf failed for page at %pK\n", alloc->pid, page_addr); goto err_alloc_page_failed; } + page->alloc = alloc; + INIT_LIST_HEAD(&page->lru); + ret = map_kernel_range_noflush((unsigned long)page_addr, - PAGE_SIZE, PAGE_KERNEL, page); + PAGE_SIZE, PAGE_KERNEL, + &page->page_ptr); flush_cache_vmap((unsigned long)page_addr, (unsigned long)page_addr + PAGE_SIZE); if (ret != 1) { @@ -247,7 +272,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } user_page_addr = (uintptr_t)page_addr + alloc->user_buffer_offset; 
- ret = vm_insert_page(vma, user_page_addr, page[0]); + ret = vm_insert_page(vma, user_page_addr, page[0].page_ptr); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", alloc->pid, user_page_addr); @@ -264,16 +289,21 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { + bool ret; + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - if (vma) - zap_page_range(vma, (uintptr_t)page_addr + - alloc->user_buffer_offset, PAGE_SIZE, NULL); + + ret = list_lru_add(&binder_alloc_lru, &page->lru); + WARN_ON(!ret); + continue; + err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: - __free_page(*page); - *page = NULL; + __free_page(page->page_ptr); + page->page_ptr = NULL; err_alloc_page_failed: +err_page_ptr_cleared: ; } err_no_vma: @@ -730,16 +760,20 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { void *page_addr; + bool on_lru; - if (!alloc->pages[i]) + if (!alloc->pages[i].page_ptr) continue; + on_lru = list_lru_del(&binder_alloc_lru, + &alloc->pages[i].lru); page_addr = alloc->buffer + i * PAGE_SIZE; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%s: %d: page %d at %pK not freed\n", - __func__, alloc->pid, i, page_addr); + "%s: %d: page %d at %pK %s\n", + __func__, alloc->pid, i, page_addr, + on_lru ? "on lru" : "active"); unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(alloc->pages[i]); + __free_page(alloc->pages[i].page_ptr); page_count++; } kfree(alloc->pages); @@ -815,6 +849,94 @@ void binder_alloc_vma_close(struct binder_alloc *alloc) WRITE_ONCE(alloc->vma_vm_mm, NULL); } +/** + * binder_alloc_free_page() - shrinker callback to free pages + * @item: item to free + * @lock: lock protecting the item + * @cb_arg: callback argument + * + * Called from list_lru_walk() in binder_shrink_scan() to free + * up pages when the system is under memory pressure. 
+ */ +enum lru_status binder_alloc_free_page(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lock, + void *cb_arg) +{ + struct mm_struct *mm = NULL; + struct binder_lru_page *page = container_of(item, + struct binder_lru_page, + lru); + struct binder_alloc *alloc; + uintptr_t page_addr; + size_t index; + + alloc = page->alloc; + if (!mutex_trylock(&alloc->mutex)) + goto err_get_alloc_mutex_failed; + + if (!page->page_ptr) + goto err_page_already_freed; + + index = page - alloc->pages; + page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; + if (alloc->vma) { + mm = get_task_mm(alloc->tsk); + if (!mm) + goto err_get_task_mm_failed; + if (!down_write_trylock(&mm->mmap_sem)) + goto err_down_write_mmap_sem_failed; + + zap_page_range(alloc->vma, + page_addr + + alloc->user_buffer_offset, + PAGE_SIZE, NULL); + + up_write(&mm->mmap_sem); + mmput(mm); + } + + unmap_kernel_range(page_addr, PAGE_SIZE); + __free_page(page->page_ptr); + page->page_ptr = NULL; + + list_lru_isolate(lru, item); + + mutex_unlock(&alloc->mutex); + return LRU_REMOVED; + +err_down_write_mmap_sem_failed: + mmput(mm); +err_get_task_mm_failed: +err_page_already_freed: + mutex_unlock(&alloc->mutex); +err_get_alloc_mutex_failed: + return LRU_SKIP; +} + +static unsigned long +binder_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long ret = list_lru_count(&binder_alloc_lru); + return ret; +} + +static unsigned long +binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long ret; + + ret = list_lru_walk(&binder_alloc_lru, binder_alloc_free_page, + NULL, sc->nr_to_scan); + return ret; +} + +struct shrinker binder_shrinker = { + .count_objects = binder_shrink_count, + .scan_objects = binder_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + /** * binder_alloc_init() - called by binder_open() for per-proc initialization * @alloc: binder_alloc for this proc @@ -830,3 +952,8 @@ void binder_alloc_init(struct binder_alloc *alloc) INIT_LIST_HEAD(&alloc->buffers); } +void binder_alloc_shrinker_init(void) +{ + list_lru_init(&binder_alloc_lru); + register_shrinker(&binder_shrinker); +} diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index dd5649bf6469..fa707cc63393 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -21,7 +21,9 @@ #include #include #include +#include +extern struct list_lru binder_alloc_lru; struct binder_transaction; /** @@ -60,6 +62,18 @@ struct binder_buffer { void *data; }; +/** + * struct binder_lru_page - page object used for binder shrinker + * @page_ptr: pointer to physical page in mmap'd space + * @lru: entry in binder_alloc_lru + * @alloc: binder_alloc for a proc + */ +struct binder_lru_page { + struct list_head lru; + struct page *page_ptr; + struct binder_alloc *alloc; +}; + /** * struct binder_alloc - per-binder proc state for binder allocator * @vma: vm_area_struct passed to mmap_handler @@ -75,8 +89,7 @@ struct binder_buffer { * @allocated_buffers: rb tree of allocated buffers sorted by address * @free_async_space: VA space available for async buffers. 
This is * initialized at mmap time to 1/2 the full VA space - * @pages: array of physical page addresses for each - * page of mmap'd space + * @pages: array of binder_lru_page * @buffer_size: size of address space specified via mmap * @pid: pid for associated binder_proc (invariant after init) * @@ -96,7 +109,7 @@ struct binder_alloc { struct rb_root free_buffers; struct rb_root allocated_buffers; size_t free_async_space; - struct page **pages; + struct binder_lru_page *pages; size_t buffer_size; uint32_t buffer_free; int pid; @@ -107,12 +120,16 @@ void binder_selftest_alloc(struct binder_alloc *alloc); #else static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} #endif +enum lru_status binder_alloc_free_page(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lock, void *cb_arg); extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, size_t offsets_size, size_t extra_buffers_size, int is_async); extern void binder_alloc_init(struct binder_alloc *alloc); +void binder_alloc_shrinker_init(void); extern void binder_alloc_vma_close(struct binder_alloc *alloc); extern struct binder_buffer * binder_alloc_prepare_to_free(struct binder_alloc *alloc, diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index 0bf72079a9da..8bd7bcef967d 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -109,9 +109,11 @@ static bool check_buffer_pages_allocated(struct binder_alloc *alloc, page_addr = buffer->data; for (; page_addr < end; page_addr += PAGE_SIZE) { page_index = (page_addr - alloc->buffer) / PAGE_SIZE; - if (!alloc->pages[page_index]) { - pr_err("incorrect alloc state at page index %d\n", - page_index); + if (!alloc->pages[page_index].page_ptr || + !list_empty(&alloc->pages[page_index].lru)) { + pr_err("expect alloc but is %s at page index %d\n", + alloc->pages[page_index].page_ptr ? + "lru" : "free", page_index); return false; } } @@ -137,28 +139,63 @@ static void binder_selftest_alloc_buf(struct binder_alloc *alloc, static void binder_selftest_free_buf(struct binder_alloc *alloc, struct binder_buffer *buffers[], - size_t *sizes, int *seq) + size_t *sizes, int *seq, size_t end) { int i; for (i = 0; i < BUFFER_NUM; i++) binder_alloc_free_buf(alloc, buffers[seq[i]]); + for (i = 0; i < end / PAGE_SIZE; i++) { + /** + * Error message on a free page can be false positive + * if binder shrinker ran during binder_alloc_free_buf + * calls above. + */ + if (list_empty(&alloc->pages[i].lru)) { + pr_err_size_seq(sizes, seq); + pr_err("expect lru but is %s at page index %d\n", + alloc->pages[i].page_ptr ? "alloc" : "free", i); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_free_page(struct binder_alloc *alloc) +{ + int i; + unsigned long count; + + while ((count = list_lru_count(&binder_alloc_lru))) { + list_lru_walk(&binder_alloc_lru, binder_alloc_free_page, + NULL, count); + } + for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) { - if ((!alloc->pages[i]) == (i == 0)) { - pr_err("incorrect free state at page index %d\n", i); + if (alloc->pages[i].page_ptr) { + pr_err("expect free but is %s at page index %d\n", + list_empty(&alloc->pages[i].lru) ? 
+ "alloc" : "lru", i); binder_selftest_failures++; } } } static void binder_selftest_alloc_free(struct binder_alloc *alloc, - size_t *sizes, int *seq) + size_t *sizes, int *seq, size_t end) { struct binder_buffer *buffers[BUFFER_NUM]; binder_selftest_alloc_buf(alloc, buffers, sizes, seq); - binder_selftest_free_buf(alloc, buffers, sizes, seq); + binder_selftest_free_buf(alloc, buffers, sizes, seq, end); + + /* Allocate from lru. */ + binder_selftest_alloc_buf(alloc, buffers, sizes, seq); + if (list_lru_count(&binder_alloc_lru)) + pr_err("lru list should be empty but is not\n"); + + binder_selftest_free_buf(alloc, buffers, sizes, seq, end); + binder_selftest_free_page(alloc); } static bool is_dup(int *seq, int index, int val) @@ -174,19 +211,20 @@ static bool is_dup(int *seq, int index, int val) /* Generate BUFFER_NUM factorial free orders. */ static void binder_selftest_free_seq(struct binder_alloc *alloc, - size_t *sizes, int *seq, int index) + size_t *sizes, int *seq, + int index, size_t end) { int i; if (index == BUFFER_NUM) { - binder_selftest_alloc_free(alloc, sizes, seq); + binder_selftest_alloc_free(alloc, sizes, seq, end); return; } for (i = 0; i < BUFFER_NUM; i++) { if (is_dup(seq, index, i)) continue; seq[index] = i; - binder_selftest_free_seq(alloc, sizes, seq, index + 1); + binder_selftest_free_seq(alloc, sizes, seq, index + 1, end); } } @@ -211,8 +249,9 @@ static void binder_selftest_alloc_size(struct binder_alloc *alloc, * we need one giant buffer before getting to the last page. */ back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1]; - binder_selftest_free_seq(alloc, front_sizes, seq, 0); - binder_selftest_free_seq(alloc, back_sizes, seq, 0); + binder_selftest_free_seq(alloc, front_sizes, seq, 0, + end_offset[BUFFER_NUM - 1]); + binder_selftest_free_seq(alloc, back_sizes, seq, 0, alloc->buffer_size); } static void binder_selftest_alloc_offset(struct binder_alloc *alloc, @@ -246,7 +285,8 @@ static void binder_selftest_alloc_offset(struct binder_alloc *alloc, * * Allocate BUFFER_NUM buffers to cover all page alignment cases, * then free them in all orders possible. Check that pages are - * allocated after buffer alloc and freed after freeing buffer. + * correctly allocated, put onto lru when buffers are freed, and + * are freed when binder_alloc_free_page is called. */ void binder_selftest_alloc(struct binder_alloc *alloc) { From 850d57dceae07291ec4bf6c55e7a6433bbe7bf30 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Wed, 2 Aug 2017 14:02:37 -0700 Subject: [PATCH 1020/3220] FROMLIST: android: binder: Add shrinker tracepoints (from https://patchwork.kernel.org/patch/9928613/) Add tracepoints in binder transaction allocator to record lru hits and alloc/free page. 
Bug: 63926541 Change-Id: I2e24fe8e7b6534349df4a87ff865a6843ac9a30b Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 27 +++++++++++++++-- drivers/android/binder_trace.h | 55 ++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index bcdaf684a658..28dc76473ff6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -237,18 +237,25 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; bool on_lru; + size_t index; - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + index = (page_addr - alloc->buffer) / PAGE_SIZE; + page = &alloc->pages[index]; if (page->page_ptr) { + trace_binder_alloc_lru_start(alloc, index); + on_lru = list_lru_del(&binder_alloc_lru, &page->lru); WARN_ON(!on_lru); + + trace_binder_alloc_lru_end(alloc, index); continue; } if (WARN_ON(!vma)) goto err_page_ptr_cleared; + trace_binder_alloc_page_start(alloc, index); page->page_ptr = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); @@ -278,6 +285,8 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, alloc->pid, user_page_addr); goto err_vm_insert_page_failed; } + + trace_binder_alloc_page_end(alloc, index); /* vm_insert_page does not seem to increment the refcount */ } if (mm) { @@ -290,11 +299,17 @@ free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { bool ret; + size_t index; - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + index = (page_addr - alloc->buffer) / PAGE_SIZE; + page = &alloc->pages[index]; + + trace_binder_free_lru_start(alloc, index); ret = list_lru_add(&binder_alloc_lru, &page->lru); WARN_ON(!ret); + + trace_binder_free_lru_end(alloc, index); continue; err_vm_insert_page_failed: @@ -887,19 +902,27 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + trace_binder_unmap_user_start(alloc, index); + zap_page_range(alloc->vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE, NULL); + trace_binder_unmap_user_end(alloc, index); + up_write(&mm->mmap_sem); mmput(mm); } + trace_binder_unmap_kernel_start(alloc, index); + unmap_kernel_range(page_addr, PAGE_SIZE); __free_page(page->page_ptr); page->page_ptr = NULL; + trace_binder_unmap_kernel_end(alloc, index); + list_lru_isolate(lru, item); mutex_unlock(&alloc->mutex); diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 7967db16ba5a..76e3b9c8a8a2 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -291,6 +291,61 @@ TRACE_EVENT(binder_update_page_range, __entry->offset, __entry->size) ); +DECLARE_EVENT_CLASS(binder_lru_page_class, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index), + TP_STRUCT__entry( + __field(int, proc) + __field(size_t, page_index) + ), + TP_fast_assign( + __entry->proc = alloc->pid; + __entry->page_index = page_index; + ), + TP_printk("proc=%d page_index=%zu", + __entry->proc, __entry->page_index) +); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, 
page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_free_lru_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_free_lru_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + TRACE_EVENT(binder_command, TP_PROTO(uint32_t cmd), TP_ARGS(cmd), From 798dfdd839e4401bd7ab9a5e107f890e5c82867a Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 22 Aug 2017 17:26:57 -0700 Subject: [PATCH 1021/3220] FROMLIST: android: binder: Add page usage in binder stats (from https://patchwork.kernel.org/patch/9928611/) Add the number of active, lru, and free pages for each binder process in binder stats Bug: 63926541 Change-Id: I12618e4eb8ecc08f4f05fe4cba454a88830897f9 Signed-off-by: Sherry Yang --- drivers/android/binder.c | 2 ++ drivers/android/binder_alloc.c | 28 ++++++++++++++++++++++++++++ drivers/android/binder_alloc.h | 2 ++ 3 files changed, 32 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3ce96db8e5d4..799de03d75a8 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5427,6 +5427,8 @@ static void print_binder_proc_stats(struct seq_file *m, count = binder_alloc_get_allocated_count(&proc->alloc); seq_printf(m, " buffers: %d\n", count); + binder_alloc_print_pages(m, &proc->alloc); + count = 0; binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) { diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 28dc76473ff6..1c1a7d9c86ed 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -831,6 +831,34 @@ void binder_alloc_print_allocated(struct seq_file *m, mutex_unlock(&alloc->mutex); } +/** + * binder_alloc_print_pages() - print page usage + * @m: seq_file for output via seq_printf() + * @alloc: binder_alloc for this proc + */ +void binder_alloc_print_pages(struct seq_file *m, + struct binder_alloc *alloc) +{ + struct binder_lru_page *page; + int i; + int active = 0; + int lru = 0; + int free = 0; + + mutex_lock(&alloc->mutex); + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + page = &alloc->pages[i]; + if (!page->page_ptr) + free++; + else if (list_empty(&page->lru)) + active++; + else + lru++; + } + mutex_unlock(&alloc->mutex); + seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); +} + /** * binder_alloc_get_allocated_count() - return count of buffers * @alloc: binder_alloc for this proc diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 
fa707cc63393..a3a3602c689c 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -142,6 +142,8 @@ extern void binder_alloc_deferred_release(struct binder_alloc *alloc); extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc); extern void binder_alloc_print_allocated(struct seq_file *m, struct binder_alloc *alloc); +void binder_alloc_print_pages(struct seq_file *m, + struct binder_alloc *alloc); /** * binder_alloc_get_free_async_space() - get free space available for async From 26b37261ea25c6928c6a69e77a8f7d39ee3267c9 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 26 May 2017 11:11:47 -0700 Subject: [PATCH 1022/3220] sched: deadline: WALT: account cumulative runnable avg Account the cumulative runnable average for WALT CPU utilization accounting. Change-Id: I56934894e626dec183740eeaf89a57d2ef638143 Signed-off-by: Joonwoo Park --- kernel/sched/deadline.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ec7f19b5020..2aae8b8b68e8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -18,6 +18,8 @@ #include +#include "walt.h" + struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) @@ -882,6 +884,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -896,6 +899,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); From 48f67ea85de468a9b3e47e723e7681cf7771dea6 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 26 May 2017 11:19:36 -0700 Subject: [PATCH 1023/3220] sched: WALT: fix broken cumulative runnable average accounting When a running task's ravg.demand changes, update_history() adjusts rq->cumulative_runnable_avg to reflect the change in CPU load. Currently this fixup is broken: it accumulates the task's new demand without subtracting the task's old demand. Fix the fixup logic to subtract the task's old demand. Change-Id: I61beb32a4850879ccb39b733f5564251e465bfeb Signed-off-by: Joonwoo Park --- kernel/sched/walt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 92c3aae8e056..166641ed1f39 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -111,8 +111,10 @@ walt_dec_cumulative_runnable_avg(struct rq *rq, static void fixup_cumulative_runnable_avg(struct rq *rq, - struct task_struct *p, s64 task_load_delta) + struct task_struct *p, u64 new_task_load) { + s64 task_load_delta = (s64)new_task_load - task_load(p); + rq->cumulative_runnable_avg += task_load_delta; if ((s64)rq->cumulative_runnable_avg < 0) panic("cra less than zero: tld: %lld, task_load(p) = %u\n", From ee4cebd75ed7b77132c39c0093923f9ff1bcafaa Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Thu, 8 Dec 2016 16:12:12 -0800 Subject: [PATCH 1024/3220] sched: EAS/WALT: use cr_avg instead of prev_runnable_sum WALT accounts for two major statistics: CPU load and cumulative tasks demand.
The CPU load, an account of each CPU's accumulated absolute execution time, is used for CPU frequency guidance, whereas the cumulative tasks demand, which reflects each CPU's instantaneous load at a given time, is used for task placement decisions. Use the cumulative tasks demand in cpu_util() for task placement and introduce cpu_util_freq() for frequency guidance. Change-Id: Id928f01dbc8cb2a617cdadc584c1f658022565c5 Signed-off-by: Joonwoo Park --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 16 +++++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9307827cc7b1..4f97de8e0b18 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2992,7 +2992,7 @@ static void sched_freq_tick_pelt(int cpu) #ifdef CONFIG_SCHED_WALT static void sched_freq_tick_walt(int cpu) { - unsigned long cpu_utilization = cpu_util(cpu); + unsigned long cpu_utilization = cpu_util_freq(cpu); unsigned long capacity_curr = capacity_curr_of(cpu); if (walt_disabled || !sysctl_sched_use_walt_cpu_util) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c193e9b1c38f..3641dad3d4cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4659,7 +4659,7 @@ static inline void hrtick_update(struct rq *rq) static bool cpu_overutilized(int cpu); unsigned long boosted_cpu_util(int cpu); #else -#define boosted_cpu_util(cpu) cpu_util(cpu) +#define boosted_cpu_util(cpu) cpu_util_freq(cpu) #endif #ifdef CONFIG_SMP @@ -5937,7 +5937,7 @@ schedtune_task_margin(struct task_struct *task) unsigned long boosted_cpu_util(int cpu) { - unsigned long util = cpu_util(cpu); + unsigned long util = cpu_util_freq(cpu); long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 029cf2bbeda2..73077f535e95 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1592,7 +1592,7 @@ static inline unsigned long __cpu_util(int cpu, int delta) #ifdef CONFIG_SCHED_WALT if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + util = cpu_rq(cpu)->cumulative_runnable_avg << SCHED_LOAD_SHIFT; do_div(util, walt_ravg_window); } #endif @@ -1608,6 +1608,20 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } +static inline unsigned long cpu_util_freq(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(util, walt_ravg_window); + } +#endif + return (util >= capacity) ? capacity : util; +} + #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From 2d7da09705d6560c61676376be3ae05a5019600a Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 16 Jan 2017 18:09:20 -0800 Subject: [PATCH 1025/3220] sched: EAS: schedfreq: fix CPU util over estimation WALT CPU utilization reports the CPU load of all the scheduler classes. Adding the RT class's load on top of it therefore causes frequency overshoot. Fix this by not accounting the RT class load when requesting capacity.
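A worked illustration with hypothetical numbers: suppose WALT reports scr->cfs = 600, of which 200 is RT load already folded in.

	old request: scr->cfs + scr->rt = 600 + 200 = 800   (RT counted twice)
	new request: scr->cfs           = 600               (RT already included)

So the old formula asked for a third more capacity than the CPU was actually using, driving the frequency needlessly high.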
Change-Id: I29600d7af7ca8c00e0d2ff1e13872024ccaa72bf Signed-off-by: Joonwoo Park --- kernel/sched/cpufreq_sched.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f10d9f7d6d07..6ffb23adbcef 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -235,6 +235,18 @@ out: cpufreq_cpu_put(policy); } +#ifdef CONFIG_SCHED_WALT +static inline unsigned long +requested_capacity(struct sched_capacity_reqs *scr) +{ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + return scr->cfs; + return scr->cfs + scr->rt; +} +#else +#define requested_capacity(scr) (scr->cfs + scr->rt) +#endif + void update_cpu_capacity_request(int cpu, bool request) { unsigned long new_capacity; @@ -245,7 +257,7 @@ void update_cpu_capacity_request(int cpu, bool request) scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - new_capacity = scr->cfs + scr->rt; + new_capacity = requested_capacity(scr); new_capacity = new_capacity * capacity_margin / SCHED_CAPACITY_SCALE; new_capacity += scr->dl; From c8b8c92bbc8954ae18cba7eb0c1f3a06a244129c Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 20 Jan 2017 11:10:15 -0800 Subject: [PATCH 1026/3220] sched: WALT: fix potential overflow Task demand and CPU util are in u64. Change-Id: If7ec1623e723026d3346201122aab0303a6d2ba2 Signed-off-by: Joonwoo Park --- kernel/sched/sched.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 73077f535e95..d4613c5be81d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1591,10 +1591,9 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long capacity = capacity_orig_of(cpu); #ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->cumulative_runnable_avg << SCHED_LOAD_SHIFT; - do_div(util, walt_ravg_window); - } + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg, + walt_ravg_window >> SCHED_LOAD_SHIFT); #endif delta += util; if (delta < 0) @@ -1614,10 +1613,9 @@ static inline unsigned long cpu_util_freq(int cpu) unsigned long capacity = capacity_orig_of(cpu); #ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; - do_div(util, walt_ravg_window); - } + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = div64_u64(cpu_rq(cpu)->prev_runnable_sum, + walt_ravg_window >> SCHED_LOAD_SHIFT); #endif return (util >= capacity) ? capacity : util; } From f94958ffa75d4f18d6d6838629d71edee41103c5 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Wed, 25 Jan 2017 17:07:19 -0800 Subject: [PATCH 1027/3220] cpufreq: sched: WALT: don't apply capacity margin twice With WALT, all the scheduler classes' load is accounted in scr->cfs, and update_cpu_capacity_request() adds the capacity margin. At present the scheduler also adds the capacity margin in the tick path, so the margin is applied twice. Fix this error by using the margin-applied CPU utilization only to check whether a frequency increase is needed.
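The compounding is easy to see with hypothetical figures, assuming the common default of capacity_margin = 1280 against SCHED_CAPACITY_SCALE = 1024 (a 25% headroom):

	one application:  util * 1280 / 1024                = 1.25   * util
	two applications: util * 1280 / 1024 * 1280 / 1024  = 1.5625 * util

	e.g. util = 800: one margin requests 1000, the double margin requests 1250.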
Change-Id: Id7d8cc73b2e4eec70b274ca66e09bb0b16bf6f09 Signed-off-by: Joonwoo Park (trivial rebase conflict) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f97de8e0b18..0b14b4634b8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2999,13 +2999,13 @@ static void sched_freq_tick_walt(int cpu) return sched_freq_tick_pelt(cpu); /* - * Add a margin to the WALT utilization. + * Add a margin to the WALT utilization to check if we will need to + * increase frequency. * NOTE: WALT tracks a single CPU signal for all the scheduling * classes, thus this margin is going to be added to the DL class as * well, which is something we do not do in sched_freq_tick_pelt case. */ - cpu_utilization = add_capacity_margin(cpu_utilization); - if (cpu_utilization <= capacity_curr) + if (add_capacity_margin(cpu_utilization) <= capacity_curr) return; /* From 94e5c965075b55a5dfd1c4cce580e2dfb0c7ffc3 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Wed, 25 Jan 2017 17:45:56 -0800 Subject: [PATCH 1028/3220] sched: EAS/WALT: take into account of waking task's load WALT's function cpu_util(cpu) reports the CPU's load without taking the waking task's load into account. Thus cpu_overutilized() currently underestimates the load on the waking task's previous CPU. Take the task's load into account when determining whether the previous CPU is overutilized, so we can bail out early without running the expensive energy_diff(). Change-Id: I30f146984a880ad2cc1b8a4ce35bd239a8c9a607 Signed-off-by: Joonwoo Park (minor rebase conflicts) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3641dad3d4cc..94bb5786d8a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4656,6 +4656,7 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP +static bool __cpu_overutilized(int cpu, int delta); static bool cpu_overutilized(int cpu); unsigned long boosted_cpu_util(int cpu); #else @@ -5856,9 +5857,14 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } +static bool __cpu_overutilized(int cpu, int delta) +{ + return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin); +} + static bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + return __cpu_overutilized(cpu, 0); } #ifdef CONFIG_SCHED_TUNE @@ -6577,6 +6583,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } if (target_cpu != prev_cpu) { + int delta = 0; struct energy_env eenv = { .util_delta = task_util(p), .src_cpu = prev_cpu, @@ -6584,8 +6591,13 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync .task = p, }; + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + delta = task_util(p); +#endif /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(prev_cpu)) { + if (__cpu_overutilized(prev_cpu, delta)) { schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); goto unlock; From 11b618a0b2fc8598474a518add7037d4005d83e8 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 24 Jul 2017 13:23:05 +0100 Subject: [PATCH 1029/3220] sched: EAS: fix incorrect energy delta calculation due to rounding error In order to calculate the energy
difference, we currently iterate over the CPUs under the same sched
domain to accumulate the total energy cost and compare before and after:

  for_each_domain(cpu)
	total_energy_before += (cpu_util * power) >> SCHED_CAPACITY_SHIFT;

  for_each_domain(cpu)
	total_energy_after += (cpu_util * power) >> SCHED_CAPACITY_SHIFT;

Accumulating this way can incorrectly report abs(delta) > 0 when there
is actually no energy delta between before and after: the same
accumulated total cpu_util of all the CPUs can be distributed
differently before and after, which causes a different amount of
rounding error in each sum.

Fix this incorrectness by shifting just once, on the accumulated
total_energy.

Change-Id: I82f1e2e358367058960938b4ef81714f57e921cf
Signed-off-by: Joonwoo Park
(moved part to another commit)
Signed-off-by: Chris Redpath
---
 kernel/sched/fair.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 94bb5786d8a7..61c8952c7aaf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5473,10 +5473,8 @@ end:
  */
 static int sched_group_energy(struct energy_env *eenv)
 {
-	struct sched_domain *sd;
-	int cpu, total_energy = 0;
 	struct cpumask visit_cpus;
-	struct sched_group *sg;
+	u64 total_energy = 0;

 	WARN_ON(!eenv->sg_top->sge);

@@ -5484,8 +5482,8 @@ static int sched_group_energy(struct energy_env *eenv)

 	while (!cpumask_empty(&visit_cpus)) {
 		struct sched_group *sg_shared_cap = NULL;
-
-		cpu = cpumask_first(&visit_cpus);
+		int cpu = cpumask_first(&visit_cpus);
+		struct sched_domain *sd;

 		/*
 		 * Is the group utilization affected by cpus outside this
@@ -5497,7 +5495,7 @@ static int sched_group_energy(struct energy_env *eenv)
 			sg_shared_cap = sd->parent->groups;

 		for_each_domain(cpu, sd) {
-			sg = sd->groups;
+			struct sched_group *sg = sd->groups;

 			/* Has this sched_domain already been visited? */
 			if (sd->child && group_first_cpu(sg) != cpu)
@@ -5533,11 +5531,9 @@ static int sched_group_energy(struct energy_env *eenv)
 				idle_idx = group_idle_state(eenv, sg);
 				group_util = group_norm_util(eenv, sg);

-				sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
-								>> SCHED_CAPACITY_SHIFT;
+				sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
 				sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
-								* sg->sge->idle_states[idle_idx].power)
-								>> SCHED_CAPACITY_SHIFT;
+								* sg->sge->idle_states[idle_idx].power);

 				total_energy += sg_busy_energy + sg_idle_energy;

@@ -5562,7 +5558,7 @@ next_cpu:
 		continue;
 	}

-	eenv->energy = total_energy;
+	eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;

 	return 0;
 }

From 3989a247e2744f437233a36a0183edf3b3dfc8f1 Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Tue, 7 Mar 2017 18:08:24 -0800
Subject: [PATCH 1030/3220] sched: EAS: kill incorrect nohz idle cpu kick

EAS does not allow the NOHZ idle balancer to run until a CPU becomes
overutilized, yet nohz_kick_needed() can still return true. This wakes
up an idle CPU for nothing.
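The rounding error fixed in PATCH 1029 above is easy to reproduce in
isolation. A standalone sketch (plain userspace C, not kernel code,
with SCHED_CAPACITY_SHIFT assumed to be 10): the same total utilization
of 300, split differently across two CPUs at the same power value,
yields a spurious energy delta when each term is shifted individually,
and no delta when the accumulated total is shifted once.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long power = 700;

		/* Shift per group, as before the fix: */
		unsigned long long one_cpu  = (300 * power) >> 10;	/* 205 */
		unsigned long long two_cpus = ((150 * power) >> 10)
					    + ((150 * power) >> 10);	/* 204 */

		/* Shift once on the accumulated total, as after the fix: */
		unsigned long long fixed = (150 * power + 150 * power) >> 10; /* 205 */

		printf("per-term: %llu vs %llu, single shift: %llu\n",
		       one_cpu, two_cpus, fixed);
		return 0;
	}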
Change-Id: I6e548442e29e4f85cda695e4c7101dd591b12fe6
Signed-off-by: Joonwoo Park
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 61c8952c7aaf..0f43ece69e8c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9768,12 +9768,12 @@ static inline bool nohz_kick_needed(struct rq *rq)
 		return true;

 	/* Do idle load balance if there have misfit task */
-	if (energy_aware() && rq->misfit_task)
-		return true;
+	if (energy_aware())
+		return rq->misfit_task;

 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
-	if (sd && !energy_aware()) {
+	if (sd) {
 		sgc = sd->groups->sgc;
 		nr_busy = atomic_read(&sgc->nr_busy_cpus);

From 0caf1df0c520325c9352149795266709e989b222 Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Tue, 16 May 2017 11:13:00 -0700
Subject: [PATCH 1031/3220] sched: WALT: fix window mis-alignment

The initial window start needs to be close to ktime ns = 0 so that
windows stay aligned with the scheduler tick.

Change-Id: Ia91f74efce2f910106622a054a6fcd507e763ca5
Signed-off-by: Joonwoo Park
---
 kernel/sched/walt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 166641ed1f39..28e999554463 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -804,11 +804,11 @@ void walt_set_window_start(struct rq *rq)
 	int cpu = cpu_of(rq);
 	struct rq *sync_rq = cpu_rq(sync_cpu);

-	if (rq->window_start)
+	if (likely(rq->window_start))
 		return;

 	if (cpu == sync_cpu) {
-		rq->window_start = walt_ktime_clock();
+		rq->window_start = 1;
 	} else {
 		raw_spin_unlock(&rq->lock);
 		double_rq_lock(rq, sync_rq);

From ab10c4d8d6f5a42665e61acec78d64e021f4b68c Mon Sep 17 00:00:00 2001
From: Xu YiPing
Date: Mon, 22 May 2017 11:26:23 -0700
Subject: [PATCH 1032/3220] FROMLIST: binder: fix memory corruption in
 binder_transaction binder

(from https://patchwork.kernel.org/patch/9939405/)

commit 7a4408c6bd3e ("binder: make sure accesses to proc/thread are
safe") made a change to enqueue tcomplete to thread->todo before
enqueuing the transaction. However, in the err_dead_proc_or_thread
case, tcomplete is freed directly without being dequeued, which can
leave the thread->todo list corrupted. So, dequeue it before freeing.

Bug: 65333488
Change-Id: Id063a4db18deaa634f4d44aa6ebca47bea32537a
Signed-off-by: Xu YiPing
Signed-off-by: Todd Kjos
---
 drivers/android/binder.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 799de03d75a8..810ff70104c1 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3253,6 +3253,7 @@ static void binder_transaction(struct binder_proc *proc,
 err_dead_proc_or_thread:
 	return_error = BR_DEAD_REPLY;
 	return_error_line = __LINE__;
+	binder_dequeue_work(proc, tcomplete);
 err_translate_failed:
 err_bad_object_type:
 err_bad_offset:

From b13d0fb3390670995a7fc90f96c17542e305ca59 Mon Sep 17 00:00:00 2001
From: Xu YiPing
Date: Tue, 5 Sep 2017 10:00:59 -0700
Subject: [PATCH 1033/3220] FROMLIST: binder: fix a ret value override

(from https://patchwork.kernel.org/patch/9939409/)

commit 372e3147df70 ("binder: guarantee txn complete / errors delivered
in-order") incorrectly defined a local ret value.
This ret value becomes invalid outside the if block, overriding the
function-level ret.

Change-Id: If7bd963ac7e67d135aa949133263aac27bf15d1a
Signed-off-by: Xu YiPing
Signed-off-by: Todd Kjos
---
 drivers/android/binder.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 810ff70104c1..7c9f9dde8397 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2481,7 +2481,6 @@ static int binder_translate_handle(struct flat_binder_object *fp,
 			  (u64)node->ptr);
 		binder_node_unlock(node);
 	} else {
-		int ret;
 		struct binder_ref_data dest_rdata;

 		binder_node_unlock(node);

From 3482bbea6b3e9ed685ce42a77927dfeaec0be122 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Tue, 11 Apr 2017 00:20:41 +0200
Subject: [PATCH 1034/3220] BACKPORT: cpufreq: schedutil: Use policy-dependent
 transition delays

Make the schedutil governor take the initial (default) value of the
rate_limit_us sysfs attribute from the (new) transition_delay_us policy
parameter (to be set by the scaling driver).

That will allow scaling drivers to make schedutil use smaller default
values of rate_limit_us and reduce the default average time interval
between consecutive frequency changes.

Make intel_pstate set transition_delay_us to 500.

BACKPORT: Modified to support the separate up_rate_limit_us and
down_rate_limit_us (upstream just has a single rate_limit_us). Also
dropped the changes for intel_pstate as there's a merge conflict.

Change-Id: I62a8543879a4d8582cdcb31ebd55607705d1c8b1
Signed-off-by: Rafael J. Wysocki
Acked-by: Viresh Kumar
(cherry picked from commit 1b72e7fd304639f1cd49d1e11955c4974936d88c)
Signed-off-by: Brendan Jackman
---
 include/linux/cpufreq.h          | 10 +++++++++-
 kernel/sched/cpufreq_schedutil.c | 20 +++++++++++++-------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index b144e7445477..490d577aa144 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -119,7 +119,15 @@ struct cpufreq_policy {
 	bool			fast_switch_possible;
 	bool			fast_switch_enabled;

-	/* Cached frequency lookup from cpufreq_driver_resolve_freq. */
+	/*
+	 * Preferred average time interval between consecutive invocations of
+	 * the driver to set the frequency for this policy. To be set by the
+	 * scaling driver (0, which is the default, means no preference).
+	 */
+	unsigned int		up_transition_delay_us;
+	unsigned int		down_transition_delay_us;
+
+	/* Cached frequency lookup from cpufreq_driver_resolve_freq.
*/ unsigned int cached_target_freq; int cached_resolved_idx; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a0bcf488fb81..b90f7434e13b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -615,7 +615,6 @@ static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; struct sugov_tunables *tunables; - unsigned int lat; int ret = 0; /* State should be equivalent to EXIT */ @@ -654,12 +653,19 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->up_rate_limit_us = LATENCY_MULTIPLIER; - tunables->down_rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) { - tunables->up_rate_limit_us *= lat; - tunables->down_rate_limit_us *= lat; + if (policy->up_transition_delay_us && policy->down_transition_delay_us) { + tunables->up_rate_limit_us = policy->up_transition_delay_us; + tunables->down_rate_limit_us = policy->down_transition_delay_us; + } else { + unsigned int lat; + + tunables->up_rate_limit_us = LATENCY_MULTIPLIER; + tunables->down_rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) { + tunables->up_rate_limit_us *= lat; + tunables->down_rate_limit_us *= lat; + } } policy->governor_data = sg_policy; From 1bab88a22447c96f832aca1115370cfdd75fb168 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 17 Aug 2017 14:38:21 +0100 Subject: [PATCH 1035/3220] ANDROID: cpufreq-dt: Set sane defaults for schedutil rate limits schedfreq used transition_latency as the default value for rate-limiting upward frequency transitions. schedutil instead uses a separate transition_delay_us field - if this is not set, it uses transition_latency * 1000, which is not sensible. The reason for the two separate values (transition_delay and transition_latency) is that they represent different quantities: "latency" reports how long it takes to execute a frequency transition. "delay" is a tunable whose ideal value is one that best balances the power cost of making a frequency transition with the benefit of more often selecting the correct frequency. "latency" is probably a lower bound on "delay". AFAIK we don't currently have a way to directly express the desired transition_delay_us field in the device tree, so for now let's just set it to the same as transition_latency, so we get similar default behaviour for schedutil as we got for schedfreq. This doesn't make any difference if userspace is setting the cpufreq tunables itself, it is just making the defaults saner. 
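To make the "not sensible" latency-based default concrete: with
LATENCY_MULTIPLIER at its usual value of 1000, a driver reporting a
500 us transition latency and no transition_delay_us ends up with a
default rate limit of 500 * 1000 us = 500 ms between frequency changes.
A minimal sketch of the selection logic (illustrative helper only, not
the kernel function):

	/* Sketch; assumes LATENCY_MULTIPLIER == 1000 as in cpufreq.h. */
	static unsigned int default_rate_limit_us(unsigned int delay_us,
						  unsigned int latency_ns)
	{
		unsigned int lat = latency_ns / 1000;	/* NSEC_PER_USEC */

		if (delay_us)		/* driver-provided preference wins */
			return delay_us;
		if (lat)		/* else scale the reported latency */
			return 1000 * lat;
		return 1000;		/* no latency info: bare multiplier */
	}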
Change-Id: Iec92d710c79d9c10d0bef1b617942185448fc214 Signed-off-by: Brendan Jackman --- drivers/cpufreq/cpufreq-dt.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 90d64081ddb3..7bd7a8a5a1f6 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -359,6 +359,13 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = transition_latency; + /* + * Android: set default parameters for parity between schedutil and + * schedfreq + */ + policy->up_transition_delay_us = transition_latency / NSEC_PER_USEC; + policy->down_transition_delay_us = 50000; /* 50ms */ + of_node_put(np); return 0; From 3479acf11d421ba1bef80b6d613c0cdc23a01850 Mon Sep 17 00:00:00 2001 From: Greg Kaiser Date: Tue, 5 Sep 2017 15:55:41 -0700 Subject: [PATCH 1036/3220] ANDROID: fiq_debugger: Fix minor bug in code We fix a typo in the code which had us comparing a pointer instead of the value which was being pointed to. This turns out to be a relatively benign bug, as we'd incorrectly pass in the empty string instead of NULL to the function, and the function can handle both. But we fix it so the code is clearly doing what we intend. Signed-off-by: Greg Kaiser Change-Id: Ib059819775a3bebca357d4ce684be779853156e3 --- drivers/staging/android/fiq_debugger/fiq_debugger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index b132cff14f01..ef9fa483ac00 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -401,7 +401,7 @@ static void fiq_debugger_work(struct work_struct *work) cmd += 6; while (*cmd == ' ') cmd++; - if (cmd != '\0') + if (*cmd != '\0') kernel_restart(cmd); else kernel_restart(NULL); From f238ef77588e2723965a853243b0a70ce3067959 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 24 Aug 2017 16:57:20 -0700 Subject: [PATCH 1037/3220] Revert "ANDROID: Use sk_uid to replace uid get from socket file" This reverts commit 623f33f213deb39994decbdc7d3b8cea82c4d558. Bug: 37524657 Change-Id: I214dbf417e720c032d28d62b8a74ece5de9cd919 Signed-off-by: Chenbo Feng --- net/netfilter/xt_qtaguid.c | 50 +++++++++++++++++------------ net/netfilter/xt_qtaguid_internal.h | 4 +-- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index fbe4e1ecce6a..f95aba44e649 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1717,9 +1717,18 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); + if (sk != NULL) { + set_sk_callback_lock = true; + read_lock_bh(&sk->sk_callback_lock); + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? 
from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); + } - - if (sk == NULL) { + if (sk == NULL || sk->sk_socket == NULL) { /* * Here, the qtaguid_find_sk() using connection tracking * couldn't find the owner, so for now we just count them @@ -1732,7 +1741,9 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (!(info->match & XT_QTAGUID_UID)) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); + MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", + par->hooknum, + sk ? sk->sk_socket : NULL); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1740,7 +1751,16 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) res = false; goto put_sock_ret_res; } - sock_uid = sk->sk_uid; + filp = sk->sk_socket->file; + if (filp == NULL) { + MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); + account_for_uid(skb, sk, 0, par); + res = ((info->match ^ info->invert) & + (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; + atomic64_inc(&qtu_events.match_no_sk_file); + goto put_sock_ret_res; + } + sock_uid = filp->f_cred->fsuid; /* * TODO: unhack how to force just accounting. * For now we only do iface stats when the uid-owner is not requested @@ -1758,8 +1778,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - if ((uid_gte(sk->sk_uid, uid_min) && - uid_lte(sk->sk_uid, uid_max)) ^ + if ((uid_gte(filp->f_cred->fsuid, uid_min) && + uid_lte(filp->f_cred->fsuid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", par->hooknum); @@ -1770,19 +1790,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->match & XT_QTAGUID_GID) { kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - if (!filp) { - res = ((info->match ^ info->invert) & XT_QTAGUID_GID) == 0; - atomic64_inc(&qtu_events.match_no_sk_gid); - goto put_sock_ret_res; - } - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); + if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { @@ -1954,7 +1962,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) "match_found_sk_in_ct=%llu " "match_found_no_sk_in_ct=%llu " "match_no_sk=%llu " - "match_no_sk_gid=%llu\n", + "match_no_sk_file=%llu\n", (u64)atomic64_read(&qtu_events.sockets_tagged), (u64)atomic64_read(&qtu_events.sockets_untagged), (u64)atomic64_read(&qtu_events.counter_set_changes), @@ -1966,7 +1974,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_gid)); + (u64)atomic64_read(&qtu_events.match_no_sk_file)); /* Count the following as part of the last item_index. 
No need * to lock the sock_tag_list here since it is already locked when diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index c7052707a6a4..8178fbdfb036 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -289,10 +289,10 @@ struct qtaguid_event_counts { */ atomic64_t match_no_sk; /* - * The file ptr in the sk_socket wasn't there and we couldn't get GID. + * The file ptr in the sk_socket wasn't there. * This might happen for traffic while the socket is being closed. */ - atomic64_t match_no_sk_gid; + atomic64_t match_no_sk_file; }; /* Track the set active_set for the given tag. */ From 5ce41fd1a024dc0f3beff98952d67e514e104602 Mon Sep 17 00:00:00 2001 From: JP Abgrall Date: Fri, 20 Dec 2013 16:51:11 -0800 Subject: [PATCH 1038/3220] ANDROID: nf: xt_qtaguid: fix handling for cases where tunnels are used. * fix skb->dev vs par->in/out When there is some forwarding going on, it introduces extra state around devs associated with xt_action_param->in/out and sk_buff->dev. E.g. par->in and par->out are both set, or skb->dev and par->out are both set (and different) This would lead qtaguid to make the wrong assumption about the direction and update the wrong device stats. Now we rely more on par->in/out. * Fix handling when qtaguid is used as "owner" When qtaguid is used as an owner module, and sk_socket->file is not there (happens when tunnels are involved), it would incorrectly do a tag stats update. * Correct debug messages. Bug: 11687690 Change-Id: I2b1ff8bd7131969ce9e25f8291d83a6280b3ba7f CRs-Fixed: 747810 Signed-off-by: JP Abgrall Git-commit: 2b71479d6f5fe8f33b335f713380f72037244395 Git-repo: https://www.codeaurora.org/cgit/quic/la/kernel/mediatek [imaund@codeaurora.org: Resolved trivial context conflicts.] Signed-off-by: Ian Maund [bflowers@codeaurora.org: Resolved merge conflicts] Signed-off-by: Bryse Flowers Signed-off-by: Chenbo Feng --- net/netfilter/xt_qtaguid.c | 150 +++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 81 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index f95aba44e649..fef69cb21f6f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1175,6 +1175,38 @@ static void iface_stat_update(struct net_device *net_dev, bool stash_only) spin_unlock_bh(&iface_stat_list_lock); } +/* Guarantied to return a net_device that has a name */ +static void get_dev_and_dir(const struct sk_buff *skb, + struct xt_action_param *par, + enum ifs_tx_rx *direction, + const struct net_device **el_dev) +{ + BUG_ON(!direction || !el_dev); + + if (par->in) { + *el_dev = par->in; + *direction = IFS_RX; + } else if (par->out) { + *el_dev = par->out; + *direction = IFS_TX; + } else { + pr_err("qtaguid[%d]: %s(): no par->in/out?!!\n", + par->hooknum, __func__); + BUG(); + } + if (unlikely(!(*el_dev)->name)) { + pr_err("qtaguid[%d]: %s(): no dev->name?!!\n", + par->hooknum, __func__); + BUG(); + } + if (skb->dev && *el_dev != skb->dev) { + MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs par->%s=%p %s\n", + par->hooknum, skb->dev, skb->dev->name, + *direction == IFS_RX ? "in" : "out", *el_dev, + (*el_dev)->name); + } +} + /* * Update stats for the specified interface from the skb. * Do nothing if the entry @@ -1186,50 +1218,27 @@ static void iface_stat_update_from_skb(const struct sk_buff *skb, { struct iface_stat *entry; const struct net_device *el_dev; - enum ifs_tx_rx direction = par->in ? 
IFS_RX : IFS_TX; + enum ifs_tx_rx direction; int bytes = skb->len; int proto; - if (!skb->dev) { - MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum); - el_dev = par->in ? : par->out; - } else { - const struct net_device *other_dev; - el_dev = skb->dev; - other_dev = par->in ? : par->out; - if (el_dev != other_dev) { - MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs " - "par->(in/out)=%p %s\n", - par->hooknum, el_dev, el_dev->name, other_dev, - other_dev->name); - } - } - - if (unlikely(!el_dev)) { - pr_err_ratelimited("qtaguid[%d]: %s(): no par->in/out?!!\n", - par->hooknum, __func__); - BUG(); - } else if (unlikely(!el_dev->name)) { - pr_err_ratelimited("qtaguid[%d]: %s(): no dev->name?!!\n", - par->hooknum, __func__); - BUG(); - } else { - proto = ipx_proto(skb, par); - MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n", - par->hooknum, el_dev->name, el_dev->type, - par->family, proto); - } + get_dev_and_dir(skb, par, &direction, &el_dev); + proto = ipx_proto(skb, par); + MT_DEBUG("qtaguid[%d]: iface_stat: %s(%s): " + "type=%d fam=%d proto=%d dir=%d\n", + par->hooknum, __func__, el_dev->name, el_dev->type, + par->family, proto, direction); spin_lock_bh(&iface_stat_list_lock); entry = get_iface_entry(el_dev->name); if (entry == NULL) { - IF_DEBUG("qtaguid: iface_stat: %s(%s): not tracked\n", - __func__, el_dev->name); + IF_DEBUG("qtaguid[%d]: iface_stat: %s(%s): not tracked\n", + par->hooknum, __func__, el_dev->name); spin_unlock_bh(&iface_stat_list_lock); return; } - IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, + IF_DEBUG("qtaguid[%d]: %s(%s): entry=%p\n", par->hooknum, __func__, el_dev->name, entry); data_counters_update(&entry->totals_via_skb, 0, direction, proto, @@ -1294,14 +1303,14 @@ static void if_tag_stat_update(const char *ifname, uid_t uid, spin_lock_bh(&iface_stat_list_lock); iface_entry = get_iface_entry(ifname); if (!iface_entry) { - pr_err_ratelimited("qtaguid: iface_stat: stat_update() " + pr_err_ratelimited("qtaguid: tag_stat: stat_update() " "%s not found\n", ifname); spin_unlock_bh(&iface_stat_list_lock); return; } /* It is ok to process data when an iface_entry is inactive */ - MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n", + MT_DEBUG("qtaguid: tag_stat: stat_update() dev=%s entry=%p\n", ifname, iface_entry); /* @@ -1318,7 +1327,7 @@ static void if_tag_stat_update(const char *ifname, uid_t uid, tag = combine_atag_with_uid(acct_tag, uid); uid_tag = make_tag_from_uid(uid); } - MT_DEBUG("qtaguid: iface_stat: stat_update(): " + MT_DEBUG("qtaguid: tag_stat: stat_update(): " " looking for tag=0x%llx (uid=%u) in ife=%p\n", tag, get_uid_from_tag(tag), iface_entry); /* Loop over tag list under this interface for {acct_tag,uid_tag} */ @@ -1578,8 +1587,8 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, struct sock *sk; unsigned int hook_mask = (1 << par->hooknum); - MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb, - par->hooknum, par->family); + MT_DEBUG("qtaguid[%d]: find_sk(skb=%p) family=%d\n", + par->hooknum, skb, par->family); /* * Let's not abuse the the xt_socket_get*_sk(), or else it will @@ -1600,8 +1609,8 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, } if (sk) { - MT_DEBUG("qtaguid: %p->sk_proto=%u " - "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state); + MT_DEBUG("qtaguid[%d]: %p->sk_proto=%u->sk_state=%d\n", + par->hooknum, sk, sk->sk_protocol, sk->sk_state); /* * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. 
@@ -1619,37 +1628,19 @@ static void account_for_uid(const struct sk_buff *skb,
 					struct xt_action_param *par)
 {
 	const struct net_device *el_dev;
+	enum ifs_tx_rx direction;
+	int proto;

-	if (!skb->dev) {
-		MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
-		el_dev = par->in ? : par->out;
-	} else {
-		const struct net_device *other_dev;
-		el_dev = skb->dev;
-		other_dev = par->in ? : par->out;
-		if (el_dev != other_dev) {
-			MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
-				"par->(in/out)=%p %s\n",
-				par->hooknum, el_dev, el_dev->name, other_dev,
-				other_dev->name);
-		}
-	}
+	get_dev_and_dir(skb, par, &direction, &el_dev);
+	proto = ipx_proto(skb, par);
+	MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d dir=%d\n",
+		 par->hooknum, el_dev->name, el_dev->type,
+		 par->family, proto, direction);

-	if (unlikely(!el_dev)) {
-		pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum);
-	} else if (unlikely(!el_dev->name)) {
-		pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum);
-	} else {
-		int proto = ipx_proto(skb, par);
-		MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n",
-			 par->hooknum, el_dev->name, el_dev->type,
-			 par->family, proto);
-
-		if_tag_stat_update(el_dev->name, uid,
-				   skb->sk ? skb->sk : alternate_sk,
-				   par->in ? IFS_RX : IFS_TX,
-				   proto, skb->len);
-	}
+	if_tag_stat_update(el_dev->name, uid,
+			   skb->sk ? skb->sk : alternate_sk,
+			   direction,
+			   proto, skb->len);
 }

 static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -1661,6 +1652,11 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	kuid_t sock_uid;
 	bool res;
 	bool set_sk_callback_lock = false;
+	/*
+	 * TODO: unhack how to force just accounting.
+	 * For now we only do tag stats when the uid-owner is not requested
+	 */
+	bool do_tag_stat = !(info->match & XT_QTAGUID_UID);

 	if (unlikely(module_passive))
 		return (info->match ^ info->invert) == 0;
@@ -1734,12 +1730,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		 * couldn't find the owner, so for now we just count them
 		 * against the system.
 		 */
-		/*
-		 * TODO: unhack how to force just accounting.
-		 * For now we only do iface stats when the uid-owner is not
-		 * requested.
-		 */
-		if (!(info->match & XT_QTAGUID_UID))
+		if (do_tag_stat)
 			account_for_uid(skb, sk, 0, par);
 		MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n",
 			 par->hooknum,
@@ -1754,18 +1745,15 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	filp = sk->sk_socket->file;
 	if (filp == NULL) {
 		MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum);
-		account_for_uid(skb, sk, 0, par);
+		if (do_tag_stat)
+			account_for_uid(skb, sk, 0, par);
 		res = ((info->match ^ info->invert) &
 			(XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0;
 		atomic64_inc(&qtu_events.match_no_sk_file);
 		goto put_sock_ret_res;
 	}
 	sock_uid = filp->f_cred->fsuid;
-	/*
-	 * TODO: unhack how to force just accounting.
-	 * For now we only do iface stats when the uid-owner is not requested
-	 */
-	if (!(info->match & XT_QTAGUID_UID))
+	if (do_tag_stat)
 		account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), par);

 	/*

From 1e07bd20e4227a009101c8f5c5f3ed66b5ef30d3 Mon Sep 17 00:00:00 2001
From: Chenbo Feng
Date: Thu, 20 Apr 2017 18:54:13 -0700
Subject: [PATCH 1039/3220] ANDROID: Use sk_uid to replace uid get from socket
 file

Retrieve the socket uid from the sk_uid field added to struct sock
instead of reading it from sk->sk_socket->file. This prevents packets
from being dropped when the socket file doesn't exist.
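The difference between the two lookups, sketched (illustrative only;
locking and the fallback value are assumptions of the sketch, not the
patch): the old path dereferences sk->sk_socket and its file pointer,
both of which can already be NULL while a socket is being torn down,
whereas sk_uid lives in struct sock itself.

	/* Old: two dereferences that can be NULL during socket close. */
	static kuid_t owner_uid_old(struct sock *sk)
	{
		struct file *filp = sk->sk_socket ? sk->sk_socket->file : NULL;

		return filp ? filp->f_cred->fsuid : GLOBAL_ROOT_UID;
	}

	/* New: sk_uid stays valid for the lifetime of the sock. */
	static kuid_t owner_uid_new(struct sock *sk)
	{
		return sk->sk_uid;
	}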
Bug: 37524657 Signed-off-by: Chenbo Feng Change-Id: Ic58239c1f9aa7e0eb1d4d1c09d40b845fd4e8e57 --- net/netfilter/xt_qtaguid.c | 55 +++++++++++++---------------- net/netfilter/xt_qtaguid_internal.h | 4 +-- 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index fef69cb21f6f..6f5cdc5b505f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1713,18 +1713,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); - if (sk != NULL) { - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); - } - if (sk == NULL || sk->sk_socket == NULL) { + if (!sk) { /* * Here, the qtaguid_find_sk() using connection tracking * couldn't find the owner, so for now we just count them @@ -1732,9 +1722,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (do_tag_stat) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", - par->hooknum, - sk ? sk->sk_socket : NULL); + MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1742,19 +1730,10 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) res = false; goto put_sock_ret_res; } - filp = sk->sk_socket->file; - if (filp == NULL) { - MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); - if (do_tag_stat) - account_for_uid(skb, sk, 0, par); - res = ((info->match ^ info->invert) & - (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; - atomic64_inc(&qtu_events.match_no_sk_file); - goto put_sock_ret_res; - } - sock_uid = filp->f_cred->fsuid; + sock_uid = sk->sk_uid; if (do_tag_stat) - account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), par); + account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), + par); /* * The following two tests fail the match when: @@ -1766,8 +1745,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - if ((uid_gte(filp->f_cred->fsuid, uid_min) && - uid_lte(filp->f_cred->fsuid, uid_max)) ^ + if ((uid_gte(sock_uid, uid_min) && + uid_lte(sock_uid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", par->hooknum); @@ -1778,7 +1757,21 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->match & XT_QTAGUID_GID) { kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - + set_sk_callback_lock = true; + read_lock_bh(&sk->sk_callback_lock); + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? 
sk->sk_socket->file : NULL; + if (!filp) { + res = ((info->match ^ info->invert) & + XT_QTAGUID_GID) == 0; + atomic64_inc(&qtu_events.match_no_sk_gid); + goto put_sock_ret_res; + } + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? + from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { @@ -1950,7 +1943,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) "match_found_sk_in_ct=%llu " "match_found_no_sk_in_ct=%llu " "match_no_sk=%llu " - "match_no_sk_file=%llu\n", + "match_no_sk_gid=%llu\n", (u64)atomic64_read(&qtu_events.sockets_tagged), (u64)atomic64_read(&qtu_events.sockets_untagged), (u64)atomic64_read(&qtu_events.counter_set_changes), @@ -1962,7 +1955,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_file)); + (u64)atomic64_read(&qtu_events.match_no_sk_gid)); /* Count the following as part of the last item_index. No need * to lock the sock_tag_list here since it is already locked when diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index 8178fbdfb036..c7052707a6a4 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -289,10 +289,10 @@ struct qtaguid_event_counts { */ atomic64_t match_no_sk; /* - * The file ptr in the sk_socket wasn't there. + * The file ptr in the sk_socket wasn't there and we couldn't get GID. * This might happen for traffic while the socket is being closed. */ - atomic64_t match_no_sk_file; + atomic64_t match_no_sk_gid; }; /* Track the set active_set for the given tag. */ From 101ef8fff5eb3b130917ba8c8bfc0def91b6e423 Mon Sep 17 00:00:00 2001 From: gaurav jindal Date: Fri, 8 Sep 2017 00:07:43 +0530 Subject: [PATCH 1040/3220] drivers: cpufreq: checks to avoid kernel crash in cpufreq_interactive In cpufreq_governor_interactive, driver throws warning with WARN_ON for !tunables and event != CPUFREQ_GOV_POLICY_INIT. In case when tunables is NULL for event other than CPUFREQ_GOV_POLICY_INIT, kernel will crash as there is no safe check available before accessing tunables. So to handle such case and avoid the kernel crash, return -EINVAL if WARN_ON returns TRUE. 
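The fix leans on the fact that WARN_ON(cond) evaluates to cond in
addition to printing a backtrace, so the check and the bail-out combine
into one statement. The shape of the pattern, as a sketch (the tunables
type and function are placeholders, not the driver's actual code):

	static int governor_event(struct tunables *t, unsigned int event)
	{
		/* Warn with a backtrace, then bail out instead of
		 * dereferencing a NULL tunables pointer and crashing. */
		if (WARN_ON(!t && event != CPUFREQ_GOV_POLICY_INIT))
			return -EINVAL;
		/* ... t is now safe to use for non-init events ... */
		return 0;
	}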
Change-Id: I7a3a22d58e3c8a315a1cc1d31143649dc8807dee Signed-off-by: gaurav jindal --- drivers/cpufreq/cpufreq_interactive.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 889c9b8b2237..52ef2ea6b65c 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1148,7 +1148,8 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, else tunables = common_tunables; - WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT)); + if (WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT))) + return -EINVAL; switch (event) { case CPUFREQ_GOV_POLICY_INIT: From e698796fd28862a9ee91f004820016eb8117a195 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 6 Sep 2017 13:06:43 -0700 Subject: [PATCH 1041/3220] ANDROID: mnt: Fix freeing of mount data Fix double free on error paths Signed-off-by: Daniel Rosenberg Change-Id: I1c25a175e87e5dd5cafcdcf9d78bf4c0dc3f88ef Bug: 65386954 Fixes: 6b42d02561d3 ("ANDROID: mnt: Add filesystem private data to mount points") --- fs/namespace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 15b91b36ecab..7e14bf1c851c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -227,6 +227,7 @@ static struct mount *alloc_vfsmnt(const char *name) mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif + mnt->mnt.data = NULL; INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -976,7 +977,6 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (!mnt) return ERR_PTR(-ENOMEM); - mnt->mnt.data = NULL; if (type->alloc_mnt_data) { mnt->mnt.data = type->alloc_mnt_data(); if (!mnt->mnt.data) { @@ -990,7 +990,6 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void root = mount_fs(type, flags, name, &mnt->mnt, data); if (IS_ERR(root)) { - kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); @@ -1094,7 +1093,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: - kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); From eb6a6f7a16aa5ffac7e56050ec13d8618e35a462 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 19 Jul 2017 17:25:07 -0700 Subject: [PATCH 1042/3220] ANDROID: Sdcardfs: Move gid derivation under flag This moves the code to adjust the gid/uid of lower filesystem files under the mount flag derive_gid. 
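For illustration, a hypothetical mount invocation enabling the new
option (option names as defined in the token table above; the device,
target path, and uid/gid values are made up), e.g.:

	mount -t sdcardfs -o fsuid=1023,fsgid=1023,derive_gid /data/media /mnt/runtime/default

Without derive_gid, lower-filesystem ownership is left at
fs_low_uid/fs_low_gid rather than derived per user.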
Signed-off-by: Daniel Rosenberg Change-Id: I44eaad4ef67c7fcfda3b6ea3502afab94442610c Bug: 63245673 --- fs/sdcardfs/derived_perm.c | 3 +++ fs/sdcardfs/inode.c | 12 ++++++++---- fs/sdcardfs/main.c | 6 ++++++ fs/sdcardfs/sdcardfs.h | 1 + fs/sdcardfs/super.c | 2 ++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 85a60fb5ff39..e0b74dc77c5f 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -176,6 +176,9 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) gid_t gid = sbi->options.fs_low_gid; struct iattr newattrs; + if (!sbi->options.gid_derivation) + return; + info = SDCARDFS_I(d_inode(dentry)); info_d = info->data; perm = info_d->perm; diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 103dc45a131f..85eb8ebb5372 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -34,10 +34,14 @@ const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, if (!cred) return NULL; - if (data->under_obb) - uid = AID_MEDIA_OBB; - else - uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid); + if (sbi->options.gid_derivation) { + if (data->under_obb) + uid = AID_MEDIA_OBB; + else + uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid); + } else { + uid = sbi->options.fs_low_uid; + } cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid); diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 80825b287836..1e73b3ace563 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -32,6 +32,7 @@ enum { Opt_multiuser, Opt_userid, Opt_reserved_mb, + Opt_gid_derivation, Opt_err, }; @@ -43,6 +44,7 @@ static const match_table_t sdcardfs_tokens = { {Opt_mask, "mask=%u"}, {Opt_userid, "userid=%d"}, {Opt_multiuser, "multiuser"}, + {Opt_gid_derivation, "derive_gid"}, {Opt_reserved_mb, "reserved_mb=%u"}, {Opt_err, NULL} }; @@ -64,6 +66,8 @@ static int parse_options(struct super_block *sb, char *options, int silent, vfsopts->gid = 0; /* by default, 0MB is reserved */ opts->reserved_mb = 0; + /* by default, gid derivation is off */ + opts->gid_derivation = false; *debug = 0; @@ -115,6 +119,8 @@ static int parse_options(struct super_block *sb, char *options, int silent, return 0; opts->reserved_mb = option; break; + case Opt_gid_derivation: + opts->gid_derivation = true; /* unknown option */ default: if (!silent) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index b8624fd8b817..88b92b2f1872 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -219,6 +219,7 @@ struct sdcardfs_mount_options { gid_t fs_low_gid; userid_t fs_user_id; bool multiuser; + bool gid_derivation; unsigned int reserved_mb; }; diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 7f4539b4b249..b89947d878e3 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -302,6 +302,8 @@ static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, seq_printf(m, ",mask=%u", vfsopts->mask); if (opts->fs_user_id) seq_printf(m, ",userid=%u", opts->fs_user_id); + if (opts->gid_derivation) + seq_puts(m, ",derive_gid"); if (opts->reserved_mb != 0) seq_printf(m, ",reserved=%uMB", opts->reserved_mb); From 3ecb1c95c89295caff001a692f496fcd00ada35d Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 8 Sep 2017 17:20:06 -0700 Subject: [PATCH 1043/3220] ANDROID: sdcardfs: Add missing break Signed-off-by: Daniel Rosenberg Bug: 63245673 Change-Id: I5fc596420301045895e5a9a7e297fd05434babf9 --- 
fs/sdcardfs/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
index 1e73b3ace563..0a2b5167e9a2 100644
--- a/fs/sdcardfs/main.c
+++ b/fs/sdcardfs/main.c
@@ -121,6 +121,7 @@ static int parse_options(struct super_block *sb, char *options, int silent,
 			break;
 		case Opt_gid_derivation:
 			opts->gid_derivation = true;
+			break;
 		/* unknown option */
 		default:
 			if (!silent)

From 2e26e045de9385990ebca686be2130e7dbfe008f Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Sat, 25 Jun 2016 21:53:30 +1000
Subject: [PATCH 1044/3220] UPSTREAM: Fix build break in fork.c when
 THREAD_SIZE < PAGE_SIZE

Commit b235beea9e99 ("Clarify naming of thread info/stack allocators")
breaks the build on some powerpc configs, where THREAD_SIZE < PAGE_SIZE:

  kernel/fork.c:235:2: error: implicit declaration of function 'free_thread_stack'
  kernel/fork.c:355:8: error: assignment from incompatible pointer type
    stack = alloc_thread_stack_node(tsk, node);
          ^

Fix it by renaming free_stack() to free_thread_stack(), and updating
the return type of alloc_thread_stack_node().

Fixes: b235beea9e99 ("Clarify naming of thread info/stack allocators")
Signed-off-by: Michael Ellerman
Signed-off-by: Linus Torvalds
Bug: 38331309
Change-Id: I5b7f920b459fb84adf5fc75f83bb488b855c4deb
(cherry picked from commit 9521d39976db20f8ef9b56af66661482a17d5364)
Signed-off-by: Zubin Mithra
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 5ee818516a1c..f98b3360397c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -176,13 +176,13 @@ static inline void free_thread_stack(unsigned long *stack)
 # else
 static struct kmem_cache *thread_stack_cache;

-static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
 	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }

-static void free_stack(unsigned long *stack)
+static void free_thread_stack(unsigned long *stack)
 {
 	kmem_cache_free(thread_stack_cache, stack);
 }

From 2876169271fc1455142d2eb78489814fe213ffea Mon Sep 17 00:00:00 2001
From: gaurav jindal
Date: Mon, 18 Sep 2017 00:58:43 +0530
Subject: [PATCH 1045/3220] drivers: cpufreq_interactive: handle error for
 module load fail

If cpufreq_register_governor() fails, the resources for the
speedchange_task thread should be released. Currently those resources
are released in module_exit, but if module loading fails, exit will not
be called and the resources will remain acquired, which may leave the
kernel in an unstable state.
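The general shape of the fix, sketched with hypothetical names (the
worker function and registration call are stand-ins): anything acquired
before a failed registration has to be unwound inside the init function
itself, because a failed module_init means module_exit never runs.

	static struct task_struct *worker;

	static int worker_fn(void *data)
	{
		/* ... per-thread work loop ... */
		return 0;
	}

	static int __init example_init(void)
	{
		int ret;

		worker = kthread_create(worker_fn, NULL, "worker");
		if (IS_ERR(worker))
			return PTR_ERR(worker);
		get_task_struct(worker);
		wake_up_process(worker);

		ret = register_my_thing(&my_ops);	/* hypothetical */
		if (ret) {
			/* Unwind by hand: exit() will not be called. */
			kthread_stop(worker);
			put_task_struct(worker);
		}
		return ret;
	}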
Change-Id: Ic33f058c069d30bfd114fa1c1380325c8e00b51c Signed-off-by: gaurav jindal --- drivers/cpufreq/cpufreq_interactive.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 52ef2ea6b65c..e0a586895136 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1316,6 +1316,7 @@ static int __init cpufreq_interactive_init(void) unsigned int i; struct cpufreq_interactive_cpuinfo *pcpu; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + int ret = 0; /* Initalize per-cpu timers */ for_each_possible_cpu(i) { @@ -1344,7 +1345,12 @@ static int __init cpufreq_interactive_init(void) /* NB: wake up so the thread does not look hung to the freezer */ wake_up_process(speedchange_task); - return cpufreq_register_governor(&cpufreq_gov_interactive); + ret = cpufreq_register_governor(&cpufreq_gov_interactive); + if (ret) { + kthread_stop(speedchange_task); + put_task_struct(speedchange_task); + } + return ret; } #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE From 9cfefbcfaab85139d3b5515d971d487f02d424e8 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 15 Sep 2017 16:17:26 -0700 Subject: [PATCH 1046/3220] ANDROID: configs: remove config fragments The kernel config fragments for Android have moved into their own repository located at https://android.googlesource.com/kernel/configs/ Bug: 63994171 Change-Id: I837bac54cb5c90e6a6eb0f6f0ad5c90588c1a46a Signed-off-by: Steve Muckle --- android/configs/README | 15 --- android/configs/android-base-arm64.cfg | 5 - android/configs/android-base.cfg | 165 ------------------------ android/configs/android-recommended.cfg | 139 -------------------- 4 files changed, 324 deletions(-) delete mode 100644 android/configs/README delete mode 100644 android/configs/android-base-arm64.cfg delete mode 100644 android/configs/android-base.cfg delete mode 100644 android/configs/android-recommended.cfg diff --git a/android/configs/README b/android/configs/README deleted file mode 100644 index 8798731f8904..000000000000 --- a/android/configs/README +++ /dev/null @@ -1,15 +0,0 @@ -The files in this directory are meant to be used as a base for an Android -kernel config. All devices should have the options in android-base.cfg enabled. -While not mandatory, the options in android-recommended.cfg enable advanced -Android features. - -Assuming you already have a minimalist defconfig for your device, a possible -way to enable these options would be: - - ARCH= scripts/kconfig/merge_config.sh /_defconfig android/configs/android-base.cfg android/configs/android-recommended.cfg - -This will generate a .config that can then be used to save a new defconfig or -compile a new kernel with Android features enabled. - -Because there is no tool to consistently generate these config fragments, -lets keep them alphabetically sorted instead of random. 
diff --git a/android/configs/android-base-arm64.cfg b/android/configs/android-base-arm64.cfg deleted file mode 100644 index 43f23d6b5391..000000000000 --- a/android/configs/android-base-arm64.cfg +++ /dev/null @@ -1,5 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -CONFIG_ARMV8_DEPRECATED=y -CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_SETEND_EMULATION=y -CONFIG_SWP_EMULATION=y diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg deleted file mode 100644 index 7199561cd42b..000000000000 --- a/android/configs/android-base.cfg +++ /dev/null @@ -1,165 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_DEVKMEM is not set -# CONFIG_DEVMEM is not set -# CONFIG_FHANDLE is not set -# CONFIG_INET_LRO is not set -# CONFIG_NFSD is not set -# CONFIG_NFS_FS is not set -# CONFIG_OABI_COMPAT is not set -# CONFIG_SYSVIPC is not set -# CONFIG_USELIB is not set -CONFIG_ANDROID=y -CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder -CONFIG_ANDROID_BINDER_IPC=y -CONFIG_ANDROID_LOW_MEMORY_KILLER=y -CONFIG_ASHMEM=y -CONFIG_AUDIT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_CGROUPS=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_SCHED=y -CONFIG_DEFAULT_SECURITY_SELINUX=y -CONFIG_EMBEDDED=y -CONFIG_FB=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_INET6_AH=y -CONFIG_INET6_ESP=y -CONFIG_INET6_IPCOMP=y -CONFIG_INET=y -CONFIG_INET_DIAG_DESTROY=y -CONFIG_INET_ESP=y -CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_IP6_NF_FILTER=y -CONFIG_IP6_NF_IPTABLES=y -CONFIG_IP6_NF_MANGLE=y -CONFIG_IP6_NF_MATCH_RPFILTER=y -CONFIG_IP6_NF_RAW=y -CONFIG_IP6_NF_TARGET_REJECT=y -CONFIG_IPV6=y -CONFIG_IPV6_MIP6=y -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_ARPFILTER=y -CONFIG_IP_NF_ARPTABLES=y -CONFIG_IP_NF_ARP_MANGLE=y -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_IPTABLES=y -CONFIG_IP_NF_MANGLE=y -CONFIG_IP_NF_MATCH_AH=y -CONFIG_IP_NF_MATCH_ECN=y -CONFIG_IP_NF_MATCH_TTL=y -CONFIG_IP_NF_NAT=y -CONFIG_IP_NF_RAW=y -CONFIG_IP_NF_SECURITY=y -CONFIG_IP_NF_TARGET_MASQUERADE=y -CONFIG_IP_NF_TARGET_NETMAP=y -CONFIG_IP_NF_TARGET_REDIRECT=y -CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_NET=y -CONFIG_NETDEVICES=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_XT_MATCH_COMMENT=y -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y -CONFIG_NETFILTER_XT_MATCH_CONNMARK=y -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y -CONFIG_NETFILTER_XT_MATCH_HELPER=y -CONFIG_NETFILTER_XT_MATCH_IPRANGE=y -CONFIG_NETFILTER_XT_MATCH_LENGTH=y -CONFIG_NETFILTER_XT_MATCH_LIMIT=y -CONFIG_NETFILTER_XT_MATCH_MAC=y -CONFIG_NETFILTER_XT_MATCH_MARK=y -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y -CONFIG_NETFILTER_XT_MATCH_POLICY=y -CONFIG_NETFILTER_XT_MATCH_QTAGUID=y -CONFIG_NETFILTER_XT_MATCH_QUOTA2=y -CONFIG_NETFILTER_XT_MATCH_QUOTA=y -CONFIG_NETFILTER_XT_MATCH_SOCKET=y -CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NETFILTER_XT_MATCH_STATISTIC=y -CONFIG_NETFILTER_XT_MATCH_STRING=y -CONFIG_NETFILTER_XT_MATCH_TIME=y -CONFIG_NETFILTER_XT_MATCH_U32=y -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y -CONFIG_NETFILTER_XT_TARGET_CONNMARK=y -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y -CONFIG_NETFILTER_XT_TARGET_MARK=y -CONFIG_NETFILTER_XT_TARGET_NFLOG=y -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y -CONFIG_NETFILTER_XT_TARGET_SECMARK=y -CONFIG_NETFILTER_XT_TARGET_TCPMSS=y 
-CONFIG_NETFILTER_XT_TARGET_TPROXY=y -CONFIG_NETFILTER_XT_TARGET_TRACE=y -CONFIG_NET_CLS_ACT=y -CONFIG_NET_CLS_U32=y -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_U32=y -CONFIG_NET_KEY=y -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_HTB=y -CONFIG_NF_CONNTRACK=y -CONFIG_NF_CONNTRACK_AMANDA=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_FTP=y -CONFIG_NF_CONNTRACK_H323=y -CONFIG_NF_CONNTRACK_IPV4=y -CONFIG_NF_CONNTRACK_IPV6=y -CONFIG_NF_CONNTRACK_IRC=y -CONFIG_NF_CONNTRACK_NETBIOS_NS=y -CONFIG_NF_CONNTRACK_PPTP=y -CONFIG_NF_CONNTRACK_SANE=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_TFTP=y -CONFIG_NF_CT_NETLINK=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_NAT=y -CONFIG_NO_HZ=y -CONFIG_PACKET=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PPP=y -CONFIG_PPPOLAC=y -CONFIG_PPPOPNS=y -CONFIG_PPP_BSDCOMP=y -CONFIG_PPP_DEFLATE=y -CONFIG_PPP_MPPE=y -CONFIG_PREEMPT=y -CONFIG_PROFILING=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_RTC_CLASS=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_SECCOMP=y -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y -CONFIG_SECURITY_SELINUX=y -CONFIG_STAGING=y -CONFIG_SYNC=y -CONFIG_TUN=y -CONFIG_UID_SYS_STATS=y -CONFIG_UNIX=y -CONFIG_USB_CONFIGFS=y -CONFIG_USB_CONFIGFS_F_ACC=y -CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_CONFIGFS_F_MTP=y -CONFIG_USB_CONFIGFS_F_PTP=y -CONFIG_USB_CONFIGFS_UEVENT=y -CONFIG_USB_GADGET=y -CONFIG_XFRM_USER=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg deleted file mode 100644 index 3d7e5e168940..000000000000 --- a/android/configs/android-recommended.cfg +++ /dev/null @@ -1,139 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_AIO is not set -# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_LEGACY_PTYS is not set -# CONFIG_NF_CONNTRACK_SIP is not set -# CONFIG_PM_WAKELOCKS_GC is not set -# CONFIG_VT is not set -CONFIG_ANDROID_TIMED_GPIO=y -CONFIG_ARM64_SW_TTBR0_PAN=y -CONFIG_ARM_KERNMEM_PERMS=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y -CONFIG_BLK_DEV_DM=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=8192 -CONFIG_CC_STACKPROTECTOR_STRONG=y -CONFIG_COMPACTION=y -CONFIG_CPU_SW_DOMAIN_PAN=y -CONFIG_DEBUG_RODATA=y -CONFIG_DM_CRYPT=y -CONFIG_DM_UEVENT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DRAGONRISE_FF=y -CONFIG_ENABLE_DEFAULT_TRACERS=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_SECURITY=y -CONFIG_FUSE_FS=y -CONFIG_GREENASIA_FF=y -CONFIG_HIDRAW=y -CONFIG_HID_A4TECH=y -CONFIG_HID_ACRUX=y -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=y -CONFIG_HID_BELKIN=y -CONFIG_HID_CHERRY=y -CONFIG_HID_CHICONY=y -CONFIG_HID_CYPRESS=y -CONFIG_HID_DRAGONRISE=y -CONFIG_HID_ELECOM=y -CONFIG_HID_EMS_FF=y -CONFIG_HID_EZKEY=y -CONFIG_HID_GREENASIA=y -CONFIG_HID_GYRATION=y -CONFIG_HID_HOLTEK=y -CONFIG_HID_KENSINGTON=y -CONFIG_HID_KEYTOUCH=y -CONFIG_HID_KYE=y -CONFIG_HID_LCPOWER=y -CONFIG_HID_LOGITECH=y -CONFIG_HID_LOGITECH_DJ=y -CONFIG_HID_MAGICMOUSE=y -CONFIG_HID_MICROSOFT=y -CONFIG_HID_MONTEREY=y -CONFIG_HID_MULTITOUCH=y -CONFIG_HID_NTRIG=y -CONFIG_HID_ORTEK=y -CONFIG_HID_PANTHERLORD=y -CONFIG_HID_PETALYNX=y -CONFIG_HID_PICOLCD=y -CONFIG_HID_PRIMAX=y -CONFIG_HID_PRODIKEYS=y -CONFIG_HID_ROCCAT=y -CONFIG_HID_SAITEK=y -CONFIG_HID_SAMSUNG=y -CONFIG_HID_SMARTJOYPLUS=y -CONFIG_HID_SONY=y -CONFIG_HID_SPEEDLINK=y -CONFIG_HID_SUNPLUS=y -CONFIG_HID_THRUSTMASTER=y -CONFIG_HID_TIVO=y -CONFIG_HID_TOPSEED=y -CONFIG_HID_TWINHAN=y 
-CONFIG_HID_UCLOGIC=y -CONFIG_HID_WACOM=y -CONFIG_HID_WALTOP=y -CONFIG_HID_WIIMOTE=y -CONFIG_HID_ZEROPLUS=y -CONFIG_HID_ZYDACRON=y -CONFIG_INPUT_EVDEV=y -CONFIG_INPUT_GPIO=y -CONFIG_INPUT_JOYSTICK=y -CONFIG_INPUT_KEYCHORD=y -CONFIG_INPUT_KEYRESET=y -CONFIG_INPUT_MISC=y -CONFIG_INPUT_TABLET=y -CONFIG_INPUT_UINPUT=y -CONFIG_ION=y -CONFIG_JOYSTICK_XPAD=y -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KSM=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGITECH_FF=y -CONFIG_MD=y -CONFIG_MEDIA_SUPPORT=y -CONFIG_MEMORY_STATE_TIME=y -CONFIG_MSDOS_FS=y -CONFIG_PANIC_TIMEOUT=5 -CONFIG_PANTHERLORD_FF=y -CONFIG_PERF_EVENTS=y -CONFIG_PM_DEBUG=y -CONFIG_PM_RUNTIME=y -CONFIG_PM_WAKELOCKS_LIMIT=0 -CONFIG_POWER_SUPPLY=y -CONFIG_PSTORE=y -CONFIG_PSTORE_CONSOLE=y -CONFIG_PSTORE_RAM=y -CONFIG_QFMT_V2=y -CONFIG_QUOTA=y -CONFIG_QUOTACTL=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -CONFIG_QUOTA_TREE=y -CONFIG_SCHEDSTATS=y -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_SND=y -CONFIG_SOUND=y -CONFIG_SUSPEND_TIME=y -CONFIG_TABLET_USB_ACECAD=y -CONFIG_TABLET_USB_AIPTEK=y -CONFIG_TABLET_USB_GTCO=y -CONFIG_TABLET_USB_HANWANG=y -CONFIG_TABLET_USB_KBTAB=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_TASK_XACCT=y -CONFIG_TIMER_STATS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_UHID=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_EHCI_HCD=y -CONFIG_USB_HIDDEV=y -CONFIG_USB_USBNET=y -CONFIG_VFAT_FS=y From 849c7764d8081c029cd7d5d2a5f9ace1a4bd084f Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 8 Sep 2017 02:09:26 -0400 Subject: [PATCH 1047/3220] FROMLIST: android: binder: Drop lru lock in isolate callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9945123/) Drop the global lru lock in isolate callback before calling zap_page_range which calls cond_resched, and re-acquire the global lru lock before returning. Also change return code to LRU_REMOVED_RETRY. Use mmput_async when fail to acquire mmap sem in an atomic context. Fix "BUG: sleeping function called from invalid context" errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled. 
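Sketched generically (callback shape per the list_lru API of this era;
teardown_item() is a stand-in for the zap_page_range() path, not a real
function): isolate the item and drop the lock before anything that can
sleep, then re-take the lock and return LRU_REMOVED_RETRY so the walker
revalidates its position.

	static enum lru_status isolate_cb(struct list_head *item,
					  struct list_lru_one *lru,
					  spinlock_t *lock, void *cb_arg)
	{
		list_lru_isolate(lru, item);	/* unlink while still locked */
		spin_unlock(lock);		/* next step may cond_resched() */

		teardown_item(item);		/* sleeping work happens here */

		spin_lock(lock);		/* walker expects the lock held */
		return LRU_REMOVED_RETRY;	/* lock was dropped: re-walk */
	}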
Bug: 63926541 Change-Id: I45dbada421b715abed9a66d03d30ae2285671ca1 Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder") Reported-by: Kyle Yan Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 1c1a7d9c86ed..9839afa84dfd 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct binder_alloc *alloc; uintptr_t page_addr; size_t index; + struct vm_area_struct *vma; alloc = page->alloc; if (!mutex_trylock(&alloc->mutex)) @@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item, index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; - if (alloc->vma) { + vma = alloc->vma; + if (vma) { mm = get_task_mm(alloc->tsk); if (!mm) goto err_get_task_mm_failed; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + } + list_lru_isolate(lru, item); + spin_unlock(lock); + + if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(alloc->vma, + zap_page_range(vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE, NULL); @@ -951,13 +958,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_kernel_end(alloc, index); - list_lru_isolate(lru, item); - + spin_lock(lock); mutex_unlock(&alloc->mutex); - return LRU_REMOVED; + return LRU_REMOVED_RETRY; err_down_write_mmap_sem_failed: - mmput(mm); + mmput_async(mm); err_get_task_mm_failed: err_page_already_freed: mutex_unlock(&alloc->mutex); From e6fa28a9a9c0f4ad120f1b6d4bfe2300e658d514 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 15 Sep 2017 20:40:03 -0400 Subject: [PATCH 1048/3220] FROMLIST: android: binder: Remove unused vma argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9954123/) The vma argument in update_page_range is no longer used after 74310e06 ("android: binder: Move buffer out of area shared with user space"), since mmap_handler no longer calls update_page_range with a vma. 
Test: ran binderLibTest, throughputtest, interfacetest and mempressure Bug: 36007193 Change-Id: Ibd6f24c11750f8f7e6ed56e40dd18c08e02ace25 Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 9839afa84dfd..cfc3f5664015 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -186,12 +186,12 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, } static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) + void *start, void *end) { void *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; + struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; @@ -215,7 +215,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm) + if (need_mm) mm = get_task_mm(alloc->tsk); if (mm) { @@ -442,7 +442,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); if (ret) return ERR_PTR(ret); @@ -483,7 +483,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, err_alloc_buf_struct_failed: binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - end_page_addr, NULL); + end_page_addr); return ERR_PTR(-ENOMEM); } @@ -567,8 +567,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, alloc->pid, buffer->data, prev->data, next->data); binder_update_page_range(alloc, 0, buffer_start_page(buffer), - buffer_start_page(buffer) + PAGE_SIZE, - NULL); + buffer_start_page(buffer) + PAGE_SIZE); } list_del(&buffer->entry); kfree(buffer); @@ -605,8 +604,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; From 9b9d7cf19106f1b2429ca192b8bfd8ffce7b859b Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 15 Sep 2017 21:12:15 -0400 Subject: [PATCH 1049/3220] FROMLIST: android: binder: Don't get mm from task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9954125/) Use binder_alloc struct's mm_struct rather than getting a reference to the mm struct through get_task_mm to avoid a potential deadlock between lru lock, task lock and dentry lock, since a thread can be holding the task lock and the dentry lock while trying to acquire the lru lock. 
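
In short, the mm lifetime handling after this patch follows three steps
(a distilled sketch of the pattern in the diff below; alloc->mutex
locking and most error handling are elided):

  /* 1. At mmap time: pin the mm_struct itself (mm_count), as mmgrab()
   *    does in later kernels, so alloc->vma_vm_mm stays valid. */
  alloc->vma = vma;
  alloc->vma_vm_mm = vma->vm_mm;
  atomic_inc(&alloc->vma_vm_mm->mm_count);

  /* 2. In the shrinker callback: take a temporary mm_users reference,
   *    as mmget_not_zero() does in later kernels, instead of going
   *    through get_task_mm() and the task lock. */
  if (!atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users))
	goto err_mmget;
  mm = alloc->vma_vm_mm;
  if (!down_write_trylock(&mm->mmap_sem)) {
	mmput_async(mm);	/* still atomic here; mmput() might sleep */
	goto err_mmget;
  }
  /* ... zap the page, up_write(&mm->mmap_sem), then mmput(mm) ... */

  /* 3. On deferred release: drop the pin taken at mmap time. */
  if (alloc->vma_vm_mm)
	mmdrop(alloc->vma_vm_mm);
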
Test: ran binderLibTest, throughputtest, interfacetest and mempressure w/lockdep Bug: 63926541 Change-Id: Icc661404eb7a4a2ecc5234b1bf8f0104665f9b45 Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 25 ++++++++++++------------- drivers/android/binder_alloc.h | 1 - 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index cfc3f5664015..b95da16fd938 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -215,17 +215,13 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (need_mm) - mm = get_task_mm(alloc->tsk); + /* Same as mmget_not_zero() in later kernel versions */ + if (need_mm && atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users)) + mm = alloc->vma_vm_mm; if (mm) { down_write(&mm->mmap_sem); vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } } if (!vma && need_mm) { @@ -718,6 +714,8 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, barrier(); alloc->vma = vma; alloc->vma_vm_mm = vma->vm_mm; + /* Same as mmgrab() in later kernel versions */ + atomic_inc(&alloc->vma_vm_mm->mm_count); return 0; @@ -793,6 +791,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) vfree(alloc->buffer); } mutex_unlock(&alloc->mutex); + if (alloc->vma_vm_mm) + mmdrop(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", @@ -887,7 +887,6 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) void binder_alloc_vma_close(struct binder_alloc *alloc) { WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); } /** @@ -924,9 +923,10 @@ enum lru_status binder_alloc_free_page(struct list_head *item, page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; vma = alloc->vma; if (vma) { - mm = get_task_mm(alloc->tsk); - if (!mm) - goto err_get_task_mm_failed; + /* Same as mmget_not_zero() in later kernel versions */ + if (!atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users)) + goto err_mmget; + mm = alloc->vma_vm_mm; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; } @@ -962,7 +962,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, err_down_write_mmap_sem_failed: mmput_async(mm); -err_get_task_mm_failed: +err_mmget: err_page_already_freed: mutex_unlock(&alloc->mutex); err_get_alloc_mutex_failed: @@ -1001,7 +1001,6 @@ struct shrinker binder_shrinker = { */ void binder_alloc_init(struct binder_alloc *alloc) { - alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(&alloc->mutex); INIT_LIST_HEAD(&alloc->buffers); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a3a3602c689c..2dd33b6df104 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,6 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; From 047200481e7f98e6334d751010c9470efb531b71 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: [PATCH 1050/3220] BACKPORT: partial: mm, oom_reaper: do not mmput synchronously from the oom reaper context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit ec8d7c14ea14922fe21945b458a75e39f11dd832) Tetsuo has properly 
noted that the mmput slow path might get blocked waiting for another
party (e.g. exit_aio waits for an IO). If that happens the oom_reaper
would be put out of the way and would not be able to process the next
oom victim. We should strive to make this context as reliable and as
independent of other subsystems as possible.

Introduce mmput_async, which will perform the slow path from an async
(WQ) context. This will delay the operation, but that shouldn't be a
problem because in most cases the oom_reaper will already have reclaimed
as much of the victim's address space as possible, and the remaining
context shouldn't bind too much memory anymore. The only exception is
when the mmap_sem trylock has failed, which shouldn't happen too often.

The issue is only theoretical but not impossible.

Signed-off-by: Michal Hocko
Reported-by: Tetsuo Handa
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds

Only backports mmput_async.

Change-Id: I5fe54abcc629e7d9eab9fe03908903d1174177f1
Signed-off-by: Arve Hjønnevåg
---
 include/linux/mm_types.h |  2 ++
 include/linux/sched.h    |  5 ++++
 kernel/fork.c            | 52 +++++++++++++++++++++++++++-------------
 3 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b32cb7add09c..907f94428ccc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -523,6 +524,7 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
+	struct work_struct async_put_work;
 };

 static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a6800fb3f4ea..d0d0b1b45418 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2764,6 +2764,11 @@ static inline void mmdrop(struct mm_struct * mm)
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
+/* same as above but performs the slow path from the async kontext. Can
+ * be called from the atomic context as well
+ */
+extern void mmput_async(struct mm_struct *);
+
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index f98b3360397c..2633d5f61d3c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -695,6 +695,26 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);

+static inline void __mmput(struct mm_struct *mm)
+{
+	VM_BUG_ON(atomic_read(&mm->mm_users));
+
+	uprobe_clear_state(mm);
+	exit_aio(mm);
+	ksm_exit(mm);
+	khugepaged_exit(mm); /* must run before exit_mmap */
+	exit_mmap(mm);
+	set_mm_exe_file(mm, NULL);
+	if (!list_empty(&mm->mmlist)) {
+		spin_lock(&mmlist_lock);
+		list_del(&mm->mmlist);
+		spin_unlock(&mmlist_lock);
+	}
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+	mmdrop(mm);
+}
+
 /*
  * Decrement the use count and release all resources for an mm.
 */
@@ -702,25 +722,25 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();

-	if (atomic_dec_and_test(&mm->mm_users)) {
-		uprobe_clear_state(mm);
-		exit_aio(mm);
-		ksm_exit(mm);
-		khugepaged_exit(mm); /* must run before exit_mmap */
-		exit_mmap(mm);
-		set_mm_exe_file(mm, NULL);
-		if (!list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_del(&mm->mmlist);
-			spin_unlock(&mmlist_lock);
-		}
-		if (mm->binfmt)
-			module_put(mm->binfmt->module);
-		mmdrop(mm);
-	}
+	if (atomic_dec_and_test(&mm->mm_users))
+		__mmput(mm);
 }
 EXPORT_SYMBOL_GPL(mmput);

+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *

From 650b6a5c418563f2a6cb4f12b3402a5b4870eea8 Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Thu, 21 Sep 2017 17:24:24 -0700
Subject: [PATCH 1051/3220] Revert "ANDROID: sched/tune: Initialize raw_spin_lock in boosted_groups"

This reverts commit c5616f2f874faa20b59b116177b99bf3948586df.

If we re-init the per-cpu boostgroup spinlock every time that we add a
new boosted cgroup, we can easily wipe out (reinit) a spinlock struct
while in a critical section. We should only be setting up the per-cpu
boostgroup data, and the spin_lock initialization need only happen once
- which we're already doing in a postcore_initcall.

For example:

     -------- CPU 0 --------      | -------- CPU1 --------
  cgroupX boost group added       |
  schedtune_enqueue_task          |
    acquires(bg->lock)            | cgroupY boost group added
                                  |   for_each_cpu()
                                  |     raw_spin_lock_init(bg->lock)
    releases(bg->lock)            |
      BUG (already unlocked)      |
                                  |

This results in the following BUG from the debug spinlock code:
BUG: spinlock already unlocked on CPU#5, rcuop/6/68

Change-Id: I3016702780b461a0cd95e26c538cd18df27d6316
Signed-off-by: Vikram Mulukutla
---
 kernel/sched/tune.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 86d04caf1684..d444fc1a4d58 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -647,7 +647,6 @@ schedtune_boostgroup_init(struct schedtune *st)
 		bg = &per_cpu(cpu_boost_groups, cpu);
 		bg->group[st->idx].boost = 0;
 		bg->group[st->idx].tasks = 0;
-		raw_spin_lock_init(&bg->lock);
 	}

 	return 0;

From c1286ff41c2f610df0cc853e78819ca67a48ffe2 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Wed, 12 Oct 2016 17:18:56 -0700
Subject: [PATCH 1052/3220] f2fs: backport from (4c1fad64 - Merge tag 'for-f2fs-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs)

Signed-off-by: Jaegeuk Kim
---
 Documentation/filesystems/f2fs.txt         |   18 +-
 fs/Kconfig                                 |    2 +
 fs/Makefile                                |    1 +
 fs/crypto/Kconfig                          |   18 +
 fs/crypto/Makefile                         |    3 +
 fs/crypto/crypto.c                         |  568 +++++++
 fs/{f2fs/crypto_fname.c => crypto/fname.c} |  278 ++--
 fs/crypto/keyinfo.c                        |  304 +++++
 fs/crypto/policy.c                         |  246 ++++
 fs/f2fs/Kconfig                            |   21 +-
 fs/f2fs/Makefile                           |    2 -
 fs/f2fs/acl.c                              |   29 +-
 fs/f2fs/acl.h                              |    3 +-
 fs/f2fs/checkpoint.c                       |  529 +++++---
 fs/f2fs/crypto.c                           |  491 --------
 fs/f2fs/data.c                             | 1250 ++++++++++--------
 fs/f2fs/debug.c                            |   80 +-
 fs/f2fs/dir.c                              |  477 +++----
 fs/f2fs/extent_cache.c                     |  315 ++---
 fs/f2fs/f2fs.h                             | 1001 +++++++++------
 fs/f2fs/file.c                             | 1328 ++++++++++++++------
 fs/f2fs/gc.c                               |  353 ++++--
 fs/f2fs/gc.h                               |    8 -
 fs/f2fs/hash.c                             |    7 +-
 fs/f2fs/inline.c                           |  268 ++--
 fs/f2fs/inode.c                            |  176 +--
fs/f2fs/namei.c | 409 +++--- fs/f2fs/node.c | 733 +++++++---- fs/f2fs/node.h | 123 +- fs/f2fs/recovery.c | 274 ++-- fs/f2fs/segment.c | 698 +++++----- fs/f2fs/segment.h | 47 +- fs/f2fs/shrinker.c | 8 +- fs/f2fs/super.c | 788 +++++++++--- fs/f2fs/trace.c | 6 +- fs/f2fs/xattr.c | 69 +- fs/f2fs/xattr.h | 3 +- include/linux/dcache.h | 1 + include/linux/f2fs_fs.h | 44 +- include/linux/fs.h | 7 + include/linux/fscrypto.h | 435 +++++++ include/trace/events/f2fs.h | 64 +- include/uapi/linux/fs.h | 18 + 43 files changed, 7560 insertions(+), 3943 deletions(-) create mode 100644 fs/crypto/Kconfig create mode 100644 fs/crypto/Makefile create mode 100644 fs/crypto/crypto.c rename fs/{f2fs/crypto_fname.c => crypto/fname.c} (53%) create mode 100644 fs/crypto/keyinfo.c create mode 100644 fs/crypto/policy.c delete mode 100644 fs/f2fs/crypto.c create mode 100644 include/linux/fscrypto.h diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index b102b436563e..753dd4f96afe 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -102,14 +102,16 @@ background_gc=%s Turn on/off cleaning operations, namely garbage collection, triggered in background when I/O subsystem is idle. If background_gc=on, it will turn on the garbage collection and if background_gc=off, garbage collection - will be truned off. If background_gc=sync, it will turn + will be turned off. If background_gc=sync, it will turn on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) -discard Issue discard/TRIM commands when a segment is cleaned. +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. no_heap Disable heap-style segment allocation which finds free segments for data from the beginning of main area, while for node from the end of main area. @@ -129,6 +131,7 @@ inline_dentry Enable the inline dir feature: data in new created directory entries can be written into inode block. The space of inode block which is used to store inline dentries is limited to ~3.4k. +noinline_dentry Diable the inline dentry feature. flush_merge Merge concurrent cache_flush commands as much as possible to eliminate redundant command issues. If the underlying device handles the cache_flush command relatively slowly, @@ -145,10 +148,15 @@ extent_cache Enable an extent cache based on rb-tree, it can cache as many as extent which map between contiguous logical address and physical address per inode, resulting in increasing the cache hit ratio. Set by default. -noextent_cache Diable an extent cache based on rb-tree explicitly, see +noextent_cache Disable an extent cache based on rb-tree explicitly, see the above extent_cache mount option. noinline_data Disable the inline data feature, inline data feature is enabled by default. +data_flush Enable data flushing before checkpoint in order to + persist data of regular and symlink. +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. ================================================================================ DEBUGFS ENTRIES @@ -192,7 +200,7 @@ Files in /sys/fs/f2fs/ policy for garbage collection. 
Setting gc_idle = 0 (default) will disable this option. Setting gc_idle = 1 will select the Cost Benefit approach - & setting gc_idle = 2 will select the greedy aproach. + & setting gc_idle = 2 will select the greedy approach. reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree @@ -298,7 +306,7 @@ The dump.f2fs shows the information of specific inode and dumps SSA and SIT to file. Each file is dump_ssa and dump_sit. The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. -It shows on-disk inode information reconized by a given inode number, and is +It shows on-disk inode information recognized by a given inode number, and is able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and ./dump_sit respectively. diff --git a/fs/Kconfig b/fs/Kconfig index a5d2dc39ba07..80af05163579 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -73,6 +73,8 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. +source "fs/crypto/Kconfig" + source "fs/notify/Kconfig" source "fs/quota/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index dee237540bc0..4644db462ba9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o +obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig new file mode 100644 index 000000000000..92348faf9865 --- /dev/null +++ b/fs/crypto/Kconfig @@ -0,0 +1,18 @@ +config FS_ENCRYPTION + tristate "FS Encryption (Per-file encryption)" + depends on BLOCK + select CRYPTO + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_CTR + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile new file mode 100644 index 000000000000..f17684c48739 --- /dev/null +++ b/fs/crypto/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o + +fscrypto-y := crypto.o fname.o policy.o keyinfo.o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c new file mode 100644 index 000000000000..2fc8c43ce531 --- /dev/null +++ b/fs/crypto/crypto.c @@ -0,0 +1,568 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *fscrypt_bounce_page_pool = NULL; + +static LIST_HEAD(fscrypt_free_ctxs); +static DEFINE_SPINLOCK(fscrypt_ctx_lock); + +static struct workqueue_struct *fscrypt_read_workqueue; +static DEFINE_MUTEX(fscrypt_init_mutex); + +static struct kmem_cache *fscrypt_ctx_cachep; +struct kmem_cache *fscrypt_info_cachep; + +/** + * fscrypt_release_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. + */ +void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); + ctx->w.bounce_page = NULL; + } + ctx->w.control_page = NULL; + if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(fscrypt_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + list_add(&ctx->free_list, &fscrypt_free_ctxs); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + } +} +EXPORT_SYMBOL(fscrypt_release_ctx); + +/** + * fscrypt_get_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx = NULL; + struct fscrypt_info *ci = inode->i_crypt_info; + unsigned long flags; + + if (ci == NULL) + return ERR_PTR(-ENOKEY); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. 
+ */ + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + ctx = list_first_entry_or_null(&fscrypt_free_ctxs, + struct fscrypt_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags &= ~FS_WRITE_PATH_FL; + return ctx; +} +EXPORT_SYMBOL(fscrypt_get_ctx); + +/** + * fscrypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void fscrypt_complete(struct crypto_async_request *req, int res) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +static int do_page_crypto(struct inode *inode, + fscrypt_direction_t rw, pgoff_t index, + struct page *src_page, struct page *dest_page, + gfp_t gfp_flags) +{ + u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; + int res = 0; + + req = skcipher_request_alloc(tfm, gfp_flags); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + fscrypt_complete, &ecr); + + BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + FS_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, + xts_tweak); + if (rw == FS_DECRYPT) + res = crypto_skcipher_decrypt(req); + else + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + skcipher_request_free(req); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_skcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +{ + ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + +/** + * fscypt_encrypt_page() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * fscrypt_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. 
+ */ +struct page *fscrypt_encrypt_page(struct inode *inode, + struct page *plaintext_page, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = fscrypt_get_ctx(inode, gfp_flags); + if (IS_ERR(ctx)) + return (struct page *)ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + if (IS_ERR(ciphertext_page)) + goto errout; + + ctx->w.control_page = plaintext_page; + err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page, + gfp_flags); + if (err) { + ciphertext_page = ERR_PTR(err); + goto errout; + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; + +errout: + fscrypt_release_ctx(ctx); + return ciphertext_page; +} +EXPORT_SYMBOL(fscrypt_encrypt_page); + +/** + * f2crypt_decrypt_page() - Decrypts a page in-place + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_decrypt_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return do_page_crypto(page->mapping->host, + FS_DECRYPT, page->index, page, page, GFP_NOFS); +} +EXPORT_SYMBOL(fscrypt_decrypt_page); + +int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! */ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(WRITE, bio); + if ((err == 0) && bio->bi_error) + err = -EIO; + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. 
+ */ +static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + struct fscrypt_info *ci; + int dir_has_key, cached_with_key; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + dput(dir); + return 0; + } + + ci = d_inode(dir)->i_crypt_info; + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be an flag in d_flags */ + spin_lock(&dentry->d_lock); + cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + dir_has_key = (ci != NULL); + dput(dir); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. + */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) + return 0; + return 1; +} + +const struct dentry_operations fscrypt_d_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +EXPORT_SYMBOL(fscrypt_d_ops); + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +void fscrypt_restore_control_page(struct page *page) +{ + struct fscrypt_ctx *ctx; + + ctx = (struct fscrypt_ctx *)page_private(page); + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + fscrypt_release_ctx(ctx); +} +EXPORT_SYMBOL(fscrypt_restore_control_page); + +static void fscrypt_destroy(void) +{ + struct fscrypt_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list) + kmem_cache_free(fscrypt_ctx_cachep, pos); + INIT_LIST_HEAD(&fscrypt_free_ctxs); + mempool_destroy(fscrypt_bounce_page_pool); + fscrypt_bounce_page_pool = NULL; +} + +/** + * fscrypt_initialize() - allocate major buffers for fs encryption. 
+ * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_initialize(void) +{ + int i, res = -ENOMEM; + + if (fscrypt_bounce_page_pool) + return 0; + + mutex_lock(&fscrypt_init_mutex); + if (fscrypt_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct fscrypt_ctx *ctx; + + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &fscrypt_free_ctxs); + } + + fscrypt_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!fscrypt_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&fscrypt_init_mutex); + return 0; +fail: + fscrypt_destroy(); + mutex_unlock(&fscrypt_init_mutex); + return res; +} +EXPORT_SYMBOL(fscrypt_initialize); + +/** + * fscrypt_init() - Set up for fs encryption. + */ +static int __init fscrypt_init(void) +{ + fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", + WQ_HIGHPRI, 0); + if (!fscrypt_read_workqueue) + goto fail; + + fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_ctx_cachep) + goto fail_free_queue; + + fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_info_cachep) + goto fail_free_ctx; + + return 0; + +fail_free_ctx: + kmem_cache_destroy(fscrypt_ctx_cachep); +fail_free_queue: + destroy_workqueue(fscrypt_read_workqueue); +fail: + return -ENOMEM; +} +module_init(fscrypt_init) + +/** + * fscrypt_exit() - Shutdown the fs encryption system + */ +static void __exit fscrypt_exit(void) +{ + fscrypt_destroy(); + + if (fscrypt_read_workqueue) + destroy_workqueue(fscrypt_read_workqueue); + kmem_cache_destroy(fscrypt_ctx_cachep); + kmem_cache_destroy(fscrypt_info_cachep); +} +module_exit(fscrypt_exit); + +MODULE_LICENSE("GPL"); diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c similarity index 53% rename from fs/f2fs/crypto_fname.c rename to fs/crypto/fname.c index 38349ed5ea51..5d6d49113efa 100644 --- a/fs/f2fs/crypto_fname.c +++ b/fs/crypto/fname.c @@ -1,46 +1,32 @@ /* - * linux/fs/f2fs/crypto_fname.c - * - * Copied from linux/fs/ext4/crypto.c + * This contains functions for filename crypto management * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * - * This contains functions for filename crypto management in f2fs - * * Written by Uday Savagaonkar, 2014. - * - * Adjust f2fs dentry structure - * Jaegeuk Kim, 2015. + * Modified by Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. 
*/ -#include -#include + #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include #include +#include -#include "f2fs.h" -#include "f2fs_crypto.h" -#include "xattr.h" +static u32 size_round_up(size_t size, size_t blksize) +{ + return ((size + blksize - 1) / blksize) * blksize; +} /** - * f2fs_dir_crypt_complete() - + * dir_crypt_complete() - */ -static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) +static void dir_crypt_complete(struct crypto_async_request *req, int res) { - struct f2fs_completion_result *ecr = req->data; + struct fscrypt_completion_result *ecr = req->data; if (res == -EINPROGRESS) return; @@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -bool f2fs_valid_filenames_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS); -} - -static unsigned max_name_len(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : - F2FS_NAME_LEN; -} - /** - * f2fs_fname_encrypt() - + * fname_encrypt() - * * This function encrypts the input filename, and returns the length of the * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. */ -static int f2fs_fname_encrypt(struct inode *inode, - const struct qstr *iname, struct f2fs_str *oname) +static int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname) { u32 ciphertext_len; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; + char iv[FS_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; - int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim = max_name_len(inode); + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? - F2FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? + FS_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = size_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > lim) ? 
lim : ciphertext_len; if (ciphertext_len <= sizeof(buf)) { @@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode, } /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); kfree(alloc_buf); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Copy the input */ memcpy(workbuf, iname->name, iname->len); @@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode, memset(workbuf + iname->len, 0, ciphertext_len - iname->len); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create encryption request */ sg_init_one(&src_sg, workbuf, ciphertext_len); sg_init_one(&dst_sg, oname->name, ciphertext_len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } kfree(alloc_buf); - ablkcipher_request_free(req); - if (res < 0) { + skcipher_request_free(req); + if (res < 0) printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); - } + oname->len = ciphertext_len; return res; } /* - * f2fs_fname_decrypt() + * fname_decrypt() * This function decrypts the input filename, and returns * the length of the plaintext. * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. 
*/ -static int f2fs_fname_decrypt(struct inode *inode, - const struct f2fs_str *iname, struct f2fs_str *oname) +static int fname_decrypt(struct inode *inode, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; - unsigned lim = max_name_len(inode); + char iv[FS_CRYPTO_BLOCK_SIZE]; + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create decryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_skcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR - "%s: Error in f2fs_fname_decrypt (error code %d)\n", - __func__, res); + "%s: Error (error code %d)\n", __func__, res); return res; } @@ -200,7 +175,7 @@ static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; /** - * f2fs_fname_encode_digest() - + * digest_encode() - * * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. * The encoded string is roughly 4/3 times the size of the input string. @@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -/** - * f2fs_fname_crypto_round_up() - - * - * Return: The next multiple of block size - */ -u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize) +u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) { - return ((size + blksize - 1) / blksize) * blksize; + int padding = 32; + struct fscrypt_info *ci = inode->i_crypt_info; + + if (ci) + padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); + if (ilen < FS_CRYPTO_BLOCK_SIZE) + ilen = FS_CRYPTO_BLOCK_SIZE; + return size_round_up(ilen, padding); } +EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** - * f2fs_fname_crypto_alloc_obuff() - + * fscrypt_fname_crypto_alloc_obuff() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
*/ -int f2fs_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct f2fs_str *crypto_str) +int fscrypt_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen; - int padding = 16; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); - if (ci) - padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); - if (padding < F2FS_CRYPTO_BLOCK_SIZE) - padding = F2FS_CRYPTO_BLOCK_SIZE; - olen = f2fs_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; - if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; - /* Allocated buffer can hold one more character to null-terminate the - * string */ + if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) + olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + /* + * Allocated buffer can hold one more character to null-terminate the + * string + */ crypto_str->name = kmalloc(olen + 1, GFP_NOFS); if (!(crypto_str->name)) return -ENOMEM; return 0; } +EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * f2fs_fname_crypto_free_buffer() - + * fscrypt_fname_crypto_free_buffer() - * * Frees the buffer allocated for crypto operation. */ -void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) +void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { if (!crypto_str) return; kfree(crypto_str->name); crypto_str->name = NULL; } +EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space + * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user + * space */ -int f2fs_fname_disk_to_usr(struct inode *inode, - f2fs_hash_t *hash, - const struct f2fs_str *iname, - struct f2fs_str *oname) +int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; int ret; - if (is_dot_dotdot(&qname)) { + if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - if (F2FS_I(inode)->i_crypt_info) - return f2fs_fname_decrypt(inode, iname, oname); + if (iname->len < FS_CRYPTO_BLOCK_SIZE) + return -EUCLEAN; - if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (inode->i_crypt_info) + return fname_decrypt(inode, iname, oname); + + if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); oname->len = ret; return ret; } if (hash) { - memcpy(buf, hash, 4); - memset(buf + 4, 0, 4); - } else + memcpy(buf, &hash, 4); + memcpy(buf + 4, &minor_hash, 4); + } else { memset(buf, 0, 8); - memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); + } + memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; ret = digest_encode(buf, 24, oname->name + 1); oname->len = ret + 1; return ret + 1; } +EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** - * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space + * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk + * space */ -int f2fs_fname_usr_to_disk(struct inode *inode, +int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, - struct f2fs_str *oname) + struct fscrypt_str *oname) { - int res; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (is_dot_dotdot(iname)) { + if (fscrypt_is_dot_dotdot(iname)) { oname->name[0] = '.'; 
oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - - if (ci) { - res = f2fs_fname_encrypt(inode, iname, oname); - return res; - } - /* Without a proper key, a user is not allowed to modify the filenames + if (inode->i_crypt_info) + return fname_encrypt(inode, iname, oname); + /* + * Without a proper key, a user is not allowed to modify the filenames * in a directory. Consequently, a user space name cannot be mapped to - * a disk-space name */ + * a disk-space name + */ return -EACCES; } +EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); -int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct f2fs_filename *fname) +int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct fscrypt_name *fname) { - struct f2fs_crypt_info *ci; int ret = 0, bigname = 0; - memset(fname, 0, sizeof(struct f2fs_filename)); + memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { + if (!dir->i_sb->s_cop->is_encrypted(dir) || + fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } - ret = f2fs_get_encryption_info(dir); - if (ret) + ret = get_crypt_info(dir); + if (ret && ret != -EOPNOTSUPP) return ret; - ci = F2FS_I(dir)->i_crypt_info; - if (ci) { - ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, - &fname->crypto_buf); + + if (dir->i_crypt_info) { + ret = fscrypt_fname_alloc_buffer(dir, iname->len, + &fname->crypto_buf); if (ret < 0) return ret; - ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); + ret = fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) goto errout; fname->disk_name.name = fname->crypto_buf.name; @@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, if (!lookup) return -EACCES; - /* We don't have the key and we are doing a lookup; decode the + /* + * We don't have the key and we are doing a lookup; decode the * user-supplied name */ if (iname->name[0] == '_') bigname = 1; - if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) + if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) return -ENOENT; fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, fname->crypto_buf.name); if (ret < 0) { @@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->crypto_buf.len = ret; if (bigname) { memcpy(&fname->hash, fname->crypto_buf.name, 4); + memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; } return 0; + errout: - f2fs_fname_crypto_free_buffer(&fname->crypto_buf); + fscrypt_fname_free_buffer(&fname->crypto_buf); return ret; } +EXPORT_SYMBOL(fscrypt_setup_filename); -void f2fs_fname_free_filename(struct f2fs_filename *fname) +void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; fname->usr_fname = NULL; fname->disk_name.name = NULL; } +EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c new file mode 100644 index 000000000000..1ac263eddc4e --- /dev/null +++ b/fs/crypto/keyinfo.c @@ -0,0 +1,304 @@ +/* + * key management facility for FS encryption support. 
+ * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#include +#include +#include +#include +#include +#include + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivation. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], + u8 source_key[FS_AES_256_XTS_KEY_SIZE], + u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_skcipher_setkey(tfm, deriving_key, + FS_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, + FS_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + wait_for_completion(&ecr.completion); + res = ecr.res; + } +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key *keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + 
up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + +static void put_crypt_info(struct fscrypt_info *ci) +{ + if (!ci) + return; + + key_put(ci->ci_keyring_key); + crypto_free_skcipher(ci->ci_ctfm); + kmem_cache_free(fscrypt_info_cachep, ci); +} + +int get_crypt_info(struct inode *inode) +{ + struct fscrypt_info *crypt_info; + struct fscrypt_context ctx; + struct crypto_skcipher *ctfm; + const char *cipher_str; + u8 raw_key[FS_MAX_KEY_SIZE]; + u8 mode; + int res; + + res = fscrypt_initialize(); + if (res) + return res; + + if (!inode->i_sb->s_cop->get_context) + return -EOPNOTSUPP; +retry: + crypt_info = ACCESS_ONCE(inode->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + fscrypt_put_encryption_info(inode, crypt_info); + goto retry; + } + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!fscrypt_dummy_context_enabled(inode)) + return res; + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) { + return -EINVAL; + } + res = 0; + + crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); + if (S_ISREG(inode->i_mode)) + mode = crypt_info->ci_data_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + mode = crypt_info->ci_filename_mode; + else + BUG(); + + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case FS_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "%s: unsupported key mode %d (ino %u)\n", + __func__, mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; + } + if (fscrypt_dummy_context_enabled(inode)) { + memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); + goto got_key; + } + + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { + goto out; + } +got_key: + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + if (res) + goto out; + + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { + put_crypt_info(crypt_info); + goto retry; + } + return 0; + +out: + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); + return res; +} + +void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) +{ + struct fscrypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(inode->i_crypt_info); + if (ci == NULL) + return; + + prev = cmpxchg(&inode->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + put_crypt_info(ci); +} +EXPORT_SYMBOL(fscrypt_put_encryption_info); + +int fscrypt_get_encryption_info(struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return get_crypt_info(inode); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c new file mode 100644 index 000000000000..ed115acb5dee --- /dev/null +++ b/fs/crypto/policy.c @@ -0,0 +1,246 @@ +/* + * Encryption policy functions for per-file encryption support. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. 
+ */ + +#include <linux/random.h> +#include <linux/string.h> +#include <linux/fscrypto.h> +#include <linux/mount.h> + +static int inode_has_encryption_context(struct inode *inode) +{ + if (!inode->i_sb->s_cop->get_context) + return 0; + return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int is_encryption_context_consistent_with_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context) + return 0; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int create_encryption_context_from_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + if (inode->i_sb->s_cop->prepare_context) { + res = inode->i_sb->s_cop->prepare_context(inode); + if (res) + return res; + } + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + + if (!fscrypt_valid_contents_enc_mode( + policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + + if (!fscrypt_valid_filenames_enc_mode( + policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return -EINVAL; + } + + if (policy->flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; + + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + + return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); +} + +int fscrypt_process_policy(struct file *filp, + const struct fscrypt_policy *policy) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (policy->version != 0) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; + } + + mnt_drop_write_file(filp); + return ret; +} +EXPORT_SYMBOL(fscrypt_process_policy); + +int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context || + !inode->i_sb->s_cop->is_encrypted(inode)) + return -ENODATA; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return
-ENODATA; + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_policy); + +int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) +{ + struct fscrypt_info *parent_ci, *child_ci; + int res; + + if ((parent == NULL) || (child == NULL)) { + printk(KERN_ERR "parent %p child %p\n", parent, child); + BUG_ON(1); + } + + /* no restrictions if the parent directory is not encrypted */ + if (!parent->i_sb->s_cop->is_encrypted(parent)) + return 1; + /* if the child directory is not encrypted, this is always a problem */ + if (!parent->i_sb->s_cop->is_encrypted(child)) + return 0; + res = fscrypt_get_encryption_info(parent); + if (res) + return 0; + res = fscrypt_get_encryption_info(child); + if (res) + return 0; + parent_ci = parent->i_crypt_info; + child_ci = child->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); +} +EXPORT_SYMBOL(fscrypt_has_permitted_context); + +/** + * fscrypt_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * @fs_data: private data given by FS. + * @preload: preload child i_crypt_info + * + * Return: Zero on success, non-zero otherwise + */ +int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload) +{ + struct fscrypt_context ctx; + struct fscrypt_info *ci; + int res; + + if (!parent->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + res = fscrypt_get_encryption_info(parent); + if (res < 0) + return res; + + ci = parent->i_crypt_info; + if (ci == NULL) + return -ENOKEY; + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + if (fscrypt_dummy_context_enabled(parent)) { + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); + } + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + res = parent->i_sb->s_cop->set_context(child, &ctx, + sizeof(ctx), fs_data); + if (res) + return res; + return preload ? fscrypt_get_encryption_info(child): 0; +} +EXPORT_SYMBOL(fscrypt_inherit_context); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index b0a9dc929f88..1852d99df97b 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,6 +1,9 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select CRYPTO + select KEYS + select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. 
The design has been focused on @@ -76,15 +79,7 @@ config F2FS_FS_ENCRYPTION bool "F2FS Encryption" depends on F2FS_FS depends on F2FS_FS_XATTR - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_CTR - select CRYPTO_SHA256 - select KEYS - select ENCRYPTED_KEYS + select FS_ENCRYPTION help Enable encryption of f2fs files and directories. This feature is similar to ecryptfs, but it is more memory @@ -100,3 +95,11 @@ config F2FS_IO_TRACE information and block IO patterns in the filesystem level. If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 08e101ed914c..ca949ea7c02f 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o -f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \ - crypto_key.o crypto_fname.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 83dcf7bfd7b8..fb0744b94c2f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -109,14 +109,16 @@ fail: return ERR_PTR(-EINVAL); } -static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_NOFS); + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +177,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -204,7 +206,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; @@ -213,11 +214,13 @@ static int __f2fs_set_acl(struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) return error; - set_acl_inode(fi, inode->i_mode); + set_acl_inode(inode, inode->i_mode); + if (error == 0) + acl = NULL; } break; @@ -232,9 +235,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, } if (acl) { - value = f2fs_acl_to_disk(acl, &size); + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); } } @@ -245,7 +248,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (!error) set_cached_acl(inode, type, acl); - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return error; } @@ -386,6 +389,8 
@@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; + f2fs_mark_inode_dirty_sync(inode); + if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 997ca8edb6cb..2c685185c24d 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,11 +37,10 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else -#define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f661d80474be..cb23d6cf676b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,6 +26,14 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *inode_entry_slab; +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + set_ckpt_flags(sbi, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + if (!end_io) + f2fs_flush_merged_bios(sbi); +} + /* * We guarantee no failure on the returned page. */ @@ -34,13 +42,14 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } - f2fs_wait_on_page_writeback(page, META); - SetPageUptodate(page); + f2fs_wait_on_page_writeback(page, META, true); + if (!PageUptodate(page)) + SetPageUptodate(page); return page; } @@ -56,14 +65,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO, - .blk_addr = index, + .old_blkaddr = index, + .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) fio.rw &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -90,7 +100,7 @@ repeat: * meta page. */ if (unlikely(!PageUptodate(page))) - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); out: return page; } @@ -143,7 +153,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { - block_t prev_blk_addr = 0; struct page *page; block_t blkno = start; struct f2fs_io_info fio = { @@ -152,10 +161,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .rw = sync ? 
(READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; + struct blk_plug plug; if (unlikely(type == META_POR)) fio.rw &= ~REQ_META; + blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) @@ -167,27 +178,25 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) blkno = 0; /* get nat block addr */ - fio.blk_addr = current_nat_addr(sbi, + fio.new_blkaddr = current_nat_addr(sbi, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: /* get sit block addr */ - fio.blk_addr = current_sit_addr(sbi, + fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - if (blkno != start && prev_blk_addr + 1 != fio.blk_addr) - goto out; - prev_blk_addr = fio.blk_addr; break; case META_SSA: case META_CP: case META_POR: - fio.blk_addr = blkno; + fio.new_blkaddr = blkno; break; default: BUG(); } - page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { @@ -196,11 +205,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; + fio.old_blkaddr = fio.new_blkaddr; f2fs_submit_page_mbio(&fio); f2fs_put_page(page, 0); } out: f2fs_submit_merged_bio(sbi, META, READ); + blk_finish_plug(&plug); return blkno - start; } @@ -210,7 +221,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) bool readahead = false; page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) + if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); @@ -232,13 +243,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); + return 0; redirty_out: @@ -252,13 +267,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; - trace_f2fs_writepages(mapping->host, wbc, META); - /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, META); + /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); @@ -269,6 +284,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + trace_f2fs_writepages(mapping->host, wbc, META); return 0; } @@ -276,15 +292,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { .for_reclaim = 0, }; + struct blk_plug plug; pagevec_init(&pvec, 0); + blk_start_plug(&plug); + while (index <= end) { int i, nr_pages; nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, @@ -296,7 +315,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < 
nr_pages; i++) { struct page *page = pvec.pages[i]; - if (prev == LONG_MAX) + if (prev == ULONG_MAX) prev = page->index - 1; if (nr_to_write != LONG_MAX && page->index != prev + 1) { pagevec_release(&pvec); @@ -315,6 +334,9 @@ continue_unlock: goto continue_unlock; } + f2fs_wait_on_page_writeback(page, META, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -334,6 +356,8 @@ stop: if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); + blk_finish_plug(&plug); + return nwritten; } @@ -341,9 +365,10 @@ static int f2fs_set_meta_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, META); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); @@ -358,6 +383,9 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -410,13 +438,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); @@ -434,12 +462,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; - for (i = APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? 
ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -459,6 +487,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; spin_lock(&im->ino_lock); + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; + } +#endif if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else @@ -478,10 +513,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + update_inode_page(inode); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -493,8 +529,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; + struct node_info ni; + int err = acquire_orphan_inode(sbi); - inode = f2fs_iget(sbi->sb, ino); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; + } + + __add_ino_entry(sbi, ino, ORPHAN_INO); + + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { /* * there should be a bug that we can't find the entry @@ -508,6 +556,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); + + get_node_info(sbi, ino, &ni); + + /* ENOMEM was fully retried in f2fs_evict_inode. */ + if (ni.blk_addr != NULL_ADDR) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return -EIO; + } + __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; } @@ -516,7 +576,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; int err; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -540,7 +600,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } /* clear Orphan Flag */ - clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); return 0; } @@ -601,45 +661,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) } } +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) +{ + unsigned long blk_size = sbi->blocksize; + size_t crc_offset = 0; + __u32 crc = 0; + + *cp_page = get_meta_page(sbi, cp_addr); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); + + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset >= blk_size) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } + + crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block + + crc_offset))); + if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); + return -EINVAL; + } + + *version = cur_cp_version(*cp_block); + return 0; +} + static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, block_t cp_addr, unsigned long 
long *version) { - struct page *cp_page_1, *cp_page_2 = NULL; - unsigned long blk_size = sbi->blocksize; - struct f2fs_checkpoint *cp_block; + struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; - size_t crc_offset; - __u32 crc = 0; + int err; - /* Read the 1st cp block in this CP pack */ - cp_page_1 = get_meta_page(sbi, cp_addr); - - /* get the version number */ - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) goto invalid_cp1; + pre_version = *version; - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp1; - - pre_version = cur_cp_version(cp_block); - - /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; - cp_page_2 = get_meta_page(sbi, cp_addr); - - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) goto invalid_cp2; - - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp2; - - cur_version = cur_cp_version(cp_block); + cur_version = *version; if (cur_version == pre_version) { *version = cur_version; @@ -696,6 +766,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + /* Sanity checking of checkpoint */ + if (sanity_check_ckpt(sbi)) + goto fail_no_cp; + if (cp_blks <= 1) goto done; @@ -722,118 +796,94 @@ fail_no_cp: return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(inode, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) + return; + + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } -void add_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); -} - -void remove_dirty_dir_inode(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); - - /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); - iput(inode); - } + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -848,6 +898,34 @@ retry: goto retry; } +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_entry(head->next, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + update_inode_page(inode); + iput(inode); + } + }; + return 0; +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -868,11 +946,17 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) + goto out; + goto retry_flush_dents; + } + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) goto out; - } goto retry_flush_dents; } @@ -885,10 +969,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -901,6 +984,8 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); + + build_free_nids(sbi); f2fs_unlock_all(sbi); } @@ -911,18 +996,48 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WRITEBACK)) + if (!atomic_read(&sbi->nr_wb_bios)) break; - io_schedule(); + io_schedule_timeout(5*HZ); } finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + spin_lock(&sbi->cp_lock); + + if (cpc->reason == CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + + spin_unlock(&sbi->cp_lock); +} + +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long 
orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; @@ -931,21 +1046,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); - bool invalidate = false; - - /* - * This avoids to conduct wrong roll-forward operations and uses - * metapages, so should be called prior to sync_meta_pages below. - */ - if (discard_next_dnode(sbi, discard_blk)) - invalidate = true; + struct super_block *sb = sbi->sb; + struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + u64 kbytes_written; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -980,10 +1089,12 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); + spin_lock(&sbi->cp_lock); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) - set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock(&sbi->cp_lock); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -998,29 +1109,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) cp_payload_blks + data_sum_blocks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - else - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - - if (cpc->reason == CP_FASTBOOT) - set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - else - clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - - if (orphan_num) - set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - else - clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); - crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); + crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset)); *((__le32 *)((unsigned char *)ckpt + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); @@ -1030,7 +1126,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1046,6 +1142,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; + + /* Record write statistics in the hot node summary */ + kbytes_written = sbi->kbytes_written; + if (sb->s_bdev->bd_part) + kbytes_written += BD_PART_WRITTEN(sbi); + + seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); + if (__remain_node_summaries(cpc->reason)) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; @@ -1058,14 +1162,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - 
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); @@ -1073,30 +1177,36 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); - /* - * invalidate meta page which is used temporarily for zeroing out - * block at the end of warm node chain. - */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, - discard_blk); - - release_dirty_inode(sbi); + release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1104,21 +1214,35 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_flush_merged_bios(sbi); + + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { + f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); + f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); + f2fs_bug_on(sbi, prefree_segments(sbi)); + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } /* * update checkpoint pack index @@ -1133,7 +1257,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1143,10 +1267,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) "checkpoint: version = %llx", ckpt_ver); /* do checkpoint periodically */ - sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); + f2fs_update_time(sbi, 
CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c deleted file mode 100644 index 4a62ef14e932..000000000000 --- a/fs/f2fs/crypto.c +++ /dev/null @@ -1,491 +0,0 @@ -/* - * linux/fs/f2fs/crypto.c - * - * Copied from linux/fs/ext4/crypto.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility - * - * This contains encryption functions for f2fs - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Remove ext4_encrypted_zeroout(), - * add f2fs_restore_and_release_control_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -/* Encryption added and removed here! (L: */ - -static unsigned int num_prealloc_crypto_pages = 32; -static unsigned int num_prealloc_crypto_ctxs = 128; - -module_param(num_prealloc_crypto_pages, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_pages, - "Number of crypto pages to preallocate"); -module_param(num_prealloc_crypto_ctxs, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_ctxs, - "Number of crypto contexts to preallocate"); - -static mempool_t *f2fs_bounce_page_pool; - -static LIST_HEAD(f2fs_free_crypto_ctxs); -static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock); - -static struct workqueue_struct *f2fs_read_workqueue; -static DEFINE_MUTEX(crypto_init); - -static struct kmem_cache *f2fs_crypto_ctx_cachep; -struct kmem_cache *f2fs_crypt_info_cachep; - -/** - * f2fs_release_crypto_ctx() - Releases an encryption context - * @ctx: The encryption context to release. - * - * If the encryption context was allocated from the pre-allocated pool, returns - * it to that pool. Else, frees it. - * - * If there's a bounce page in the context, this frees that. - */ -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx) -{ - unsigned long flags; - - if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) { - mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool); - ctx->w.bounce_page = NULL; - } - ctx->w.control_page = NULL; - if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { - kmem_cache_free(f2fs_crypto_ctx_cachep, ctx); - } else { - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - } -} - -/** - * f2fs_get_crypto_ctx() - Gets an encryption context - * @inode: The inode for which we are doing the crypto - * - * Allocates and initializes an encryption context. - * - * Return: An allocated and initialized encryption context on success; error - * value or NULL otherwise. 
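Usage of this helper is a strict get/release pairing around each crypto operation. A minimal sketch of a caller, mirroring the convenience helper f2fs_decrypt_one() later in this file (hypothetical code, not part of the patch):

        struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode);

        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
        /* ... encrypt or decrypt pages using ctx ... */
        f2fs_release_crypto_ctx(ctx);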
- */ -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode) -{ - struct f2fs_crypto_ctx *ctx = NULL; - unsigned long flags; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (ci == NULL) - return ERR_PTR(-ENOKEY); - - /* - * We first try getting the ctx from a free list because in - * the common case the ctx will have an allocated and - * initialized crypto tfm, so it's probably a worthwhile - * optimization. For the bounce page, we first try getting it - * from the kernel allocator because that's just about as fast - * as getting it from a list and because a cache of free pages - * should generally be a "last resort" option for a filesystem - * to be able to do its job. - */ - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs, - struct f2fs_crypto_ctx, free_list); - if (ctx) - list_del(&ctx->free_list); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - if (!ctx) { - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags &= ~F2FS_WRITE_PATH_FL; - return ctx; -} - -/* - * Call f2fs_decrypt on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct f2fs_crypto_ctx *ctx = - container_of(work, struct f2fs_crypto_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = f2fs_decrypt(ctx, page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else - SetPageUptodate(page); - unlock_page(page); - } - f2fs_release_crypto_ctx(ctx); - bio_put(bio); -} - -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(f2fs_read_workqueue, &ctx->r.work); -} - -static void f2fs_crypto_destroy(void) -{ - struct f2fs_crypto_ctx *pos, *n; - - list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list) - kmem_cache_free(f2fs_crypto_ctx_cachep, pos); - INIT_LIST_HEAD(&f2fs_free_crypto_ctxs); - if (f2fs_bounce_page_pool) - mempool_destroy(f2fs_bounce_page_pool); - f2fs_bounce_page_pool = NULL; -} - -/** - * f2fs_crypto_initialize() - Set up for f2fs encryption. - * - * We only call this when we start accessing encrypted files, since it - * results in memory getting allocated that wouldn't otherwise be used. - * - * Return: Zero on success, non-zero otherwise. 
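The body that follows has the classic check-lock-recheck shape for lazy one-time setup; schematically (a condensed sketch, not the verbatim body):

        if (f2fs_bounce_page_pool)      /* fast path: already set up */
                return 0;
        mutex_lock(&crypto_init);
        if (f2fs_bounce_page_pool)      /* lost a race: another task finished first */
                goto already_initialized;
        /* preallocate contexts, then create the bounce page pool last */

The pool is deliberately allocated last: its non-NULL value is what the unlocked fast path tests, so a visible pool reliably implies fully initialized state.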
- */ -int f2fs_crypto_initialize(void) -{ - int i, res = -ENOMEM; - - if (f2fs_bounce_page_pool) - return 0; - - mutex_lock(&crypto_init); - if (f2fs_bounce_page_pool) - goto already_initialized; - - for (i = 0; i < num_prealloc_crypto_ctxs; i++) { - struct f2fs_crypto_ctx *ctx; - - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL); - if (!ctx) - goto fail; - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - } - - /* must be allocated at the last step to avoid race condition above */ - f2fs_bounce_page_pool = - mempool_create_page_pool(num_prealloc_crypto_pages, 0); - if (!f2fs_bounce_page_pool) - goto fail; - -already_initialized: - mutex_unlock(&crypto_init); - return 0; -fail: - f2fs_crypto_destroy(); - mutex_unlock(&crypto_init); - return res; -} - -/** - * f2fs_exit_crypto() - Shutdown the f2fs encryption system - */ -void f2fs_exit_crypto(void) -{ - f2fs_crypto_destroy(); - - if (f2fs_read_workqueue) - destroy_workqueue(f2fs_read_workqueue); - if (f2fs_crypto_ctx_cachep) - kmem_cache_destroy(f2fs_crypto_ctx_cachep); - if (f2fs_crypt_info_cachep) - kmem_cache_destroy(f2fs_crypt_info_cachep); -} - -int __init f2fs_init_crypto(void) -{ - int res = -ENOMEM; - - f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0); - if (!f2fs_read_workqueue) - goto fail; - - f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypto_ctx_cachep) - goto fail; - - f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypt_info_cachep) - goto fail; - - return 0; -fail: - f2fs_exit_crypto(); - return res; -} - -void f2fs_restore_and_release_control_page(struct page **page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. 
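The linkage being unwound here was established on the write path: the ciphertext (bounce) page carries the encryption context in its page_private, and the context remembers the original plaintext (control) page. In outline (the assignments appear verbatim in f2fs_encrypt() below):

        ctx->w.control_page = plaintext_page;
        set_page_private(ciphertext_page, (unsigned long)ctx);
        /* completion path walks back: bounce page -> ctx -> control page */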
*/ - bounce_page = *page; - ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - f2fs_restore_control_page(bounce_page); -} - -void f2fs_restore_control_page(struct page *data_page) -{ - struct f2fs_crypto_ctx *ctx = - (struct f2fs_crypto_ctx *)page_private(data_page); - - set_page_private(data_page, (unsigned long)NULL); - ClearPagePrivate(data_page); - unlock_page(data_page); - f2fs_release_crypto_ctx(ctx); -} - -/** - * f2fs_crypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation - */ -static void f2fs_crypt_complete(struct crypto_async_request *req, int res) -{ - struct f2fs_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -typedef enum { - F2FS_DECRYPT = 0, - F2FS_ENCRYPT, -} f2fs_direction_t; - -static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, - struct inode *inode, - f2fs_direction_t rw, - pgoff_t index, - struct page *src_page, - struct page *dest_page) -{ - u8 xts_tweak[F2FS_XTS_TWEAK_SIZE]; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist dst, src; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; - int res = 0; - - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); - return -ENOMEM; - } - ablkcipher_request_set_callback( - req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_crypt_complete, &ecr); - - BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - F2FS_XTS_TWEAK_SIZE - sizeof(index)); - - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); - sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, - xts_tweak); - if (rw == F2FS_DECRYPT) - res = crypto_ablkcipher_decrypt(req); - else - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } - ablkcipher_request_free(req); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_encrypt() returned %d\n", - __func__, res); - return res; - } - return 0; -} - -static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx) -{ - ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT); - if (ctx->w.bounce_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_WRITE_PATH_FL; - return ctx->w.bounce_page; -} - -/** - * f2fs_encrypt() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. - * - * Called on the page write path. The caller must call - * f2fs_restore_control_page() on the returned ciphertext page to - * release the bounce buffer and the encryption context. - * - * Return: An allocated page with the encrypted content on success. Else, an - * error value or NULL. 
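Restating the contract above as a hypothetical write-path caller (the I/O submission step is elided):

        struct page *cpage = f2fs_encrypt(inode, plaintext_page);

        if (IS_ERR(cpage))
                return PTR_ERR(cpage);
        /* ... submit cpage for write I/O and wait for completion ... */
        f2fs_restore_control_page(cpage);       /* releases bounce page and ctx */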
- */ -struct page *f2fs_encrypt(struct inode *inode, - struct page *plaintext_page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - int err; - - BUG_ON(!PageLocked(plaintext_page)); - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - return (struct page *)ctx; - - /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx); - if (IS_ERR(ciphertext_page)) - goto err_out; - - ctx->w.control_page = plaintext_page; - err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); - if (err) { - ciphertext_page = ERR_PTR(err); - goto err_out; - } - - SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)ctx); - lock_page(ciphertext_page); - return ciphertext_page; - -err_out: - f2fs_release_crypto_ctx(ctx); - return ciphertext_page; -} - -/** - * f2fs_decrypt() - Decrypts a page in-place - * @ctx: The encryption context. - * @page: The page to decrypt. Must be locked. - * - * Decrypts page in-place using the ctx encryption context. - * - * Called from the read completion callback. - * - * Return: Zero on success, non-zero otherwise. - */ -int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page) -{ - BUG_ON(!PageLocked(page)); - - return f2fs_page_crypto(ctx, page->mapping->host, - F2FS_DECRYPT, page->index, page, page); -} - -/* - * Convenience function which takes care of allocating and - * deallocating the encryption context - */ -int f2fs_decrypt_one(struct inode *inode, struct page *page) -{ - struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode); - int ret; - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = f2fs_decrypt(ctx, page); - f2fs_release_crypto_ctx(ctx); - return ret; -} - -bool f2fs_valid_contents_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS); -} - -/** - * f2fs_validate_encryption_key_size() - Validate the encryption key size - * @mode: The key mode. - * @size: The key size to validate. - * - * Return: The validated key size for @mode. Zero if invalid. 
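A hypothetical call site during key setup treats the zero return as the failure signal (master_key is an illustrative name here, not taken from this patch):

        if (!f2fs_validate_encryption_key_size(mode, master_key->size))
                return -ENOKEY;         /* keyring payload has the wrong length */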
- */ -uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size) -{ - if (size == f2fs_encryption_key_size(mode)) - return size; - return 0; -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8936044dee4c..fc9bfd8f566d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include "f2fs.h" @@ -33,11 +35,16 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_vec *bvec; int i; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + bio->bi_error = -EIO; +#endif + if (f2fs_bio_encrypted(bio)) { if (bio->bi_error) { - f2fs_release_crypto_ctx(bio->bi_private); + fscrypt_release_ctx(bio->bi_private); } else { - f2fs_end_io_crypto_work(bio->bi_private, bio); + fscrypt_decrypt_bio_pages(bio->bi_private, bio); return; } } @@ -46,7 +53,8 @@ static void f2fs_read_end_io(struct bio *bio) struct page *page = bvec->bv_page; if (!bio->bi_error) { - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); @@ -65,19 +73,16 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - f2fs_restore_and_release_control_page(&page); + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { - set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); } - - if (!get_pages(sbi, F2FS_WRITEBACK) && - !list_empty(&sbi->cp_wait.task_list)) + if (atomic_dec_and_test(&sbi->nr_wb_bios) && + wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); @@ -101,6 +106,18 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } +static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, + struct bio *bio, enum page_type type) +{ + if (!is_read_io(rw)) { + atomic_inc(&sbi->nr_wb_bios); + if (f2fs_sb_mounted_hmsmr(sbi->sb) && + current->plug && (type == DATA || type == NODE)) + blk_finish_plug(current->plug); + } + submit_bio(rw, bio); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -113,12 +130,58 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - submit_bio(fio->rw, io->bio); + __submit_bio(io->sbi, fio->rw, io->bio, fio->type); io->bio = NULL; } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - enum page_type type, int rw) +static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, + struct page *page, nid_t ino) +{ + struct bio_vec *bvec; + struct page *target; + int i; + + if (!io->bio) + return false; + + if (!inode && !page && !ino) + return true; + + bio_for_each_segment_all(bvec, io->bio, i) { + + if (bvec->bv_page->mapping) + target = bvec->bv_page; + else + target = fscrypt_control_page(bvec->bv_page); + + if (inode && inode == target->mapping->host) + return true; + if (page && page == target) + return true; + if (ino && ino == ino_of_node(target)) + return true; + } + + return false; +} + +static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, + struct page *page, nid_t ino, + enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + bool ret; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, 
inode, page, ino); + up_read(&io->io_rwsem); + return ret; +} + +static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -127,6 +190,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, down_write(&io->io_rwsem); + if (!__has_merged_page(io, inode, page, ino)) + goto out; + /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -136,9 +202,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); +out: up_write(&io->io_rwsem); } +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, + int rw) +{ + __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); +} + +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) +{ + if (has_merged_page(sbi, inode, page, ino, type)) + __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); +} + +void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); +} + /* * Fill the locked page with data located in the block address. * Return unlocked page. @@ -146,20 +234,21 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; - struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + struct page *page = fio->encrypted_page ? + fio->encrypted_page : fio->page; trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } - submit_bio(fio->rw, bio); + __submit_bio(fio->sbi, fio->rw, bio, fio->type); return 0; } @@ -173,39 +262,49 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io = is_read ? &sbi->read_io : &sbi->write_io[btype]; - verify_block_addr(sbi, fio->blk_addr); + if (fio->old_blkaddr != NEW_ADDR) + verify_block_addr(sbi, fio->old_blkaddr); + verify_block_addr(sbi, fio->new_blkaddr); down_write(&io->io_rwsem); - if (!is_read) - inc_page_count(sbi, F2FS_WRITEBACK); - - if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || + if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || io->fio.rw != fio->rw)) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); + io->bio = __bio_alloc(sbi, fio->new_blkaddr, + bio_blocks, is_read); io->fio = *fio; } bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < + PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } - io->last_block_in_bio = fio->blk_addr; + io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); } +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -214,39 +313,63 @@ alloc_new: */ void set_data_blkaddr(struct dnode_of_data *dn) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; - - f2fs_wait_on_page_writeback(node_page, NODE); - - rn = F2FS_NODE(node_page); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; } -int reserve_new_block(struct dnode_of_data *dn) +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + dn->data_blkaddr = blkaddr; + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); +} + +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); - dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); - mark_inode_dirty(dn->inode); - sync_inode_page(dn); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; return 0; } +/* Should keep dn->ofs_in_node unchanged */ +int reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; @@ -326,13 +449,14 @@ got_it: * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); return page; } - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) @@ -386,14 +510,14 @@ repeat: /* wait for read completion */ lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -413,7 +537,7 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* @@ -437,45 +561,42 @@ repeat: goto got_it; if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC, true); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: if (new_i_size && i_size_read(inode) < - ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); - /* Only the directory inode sets new_i_size */ - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } static int __allocate_data_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct f2fs_inode_info *fi = F2FS_I(dn->inode); struct f2fs_summary sum; struct node_info ni; int seg = CURSEG_WARM_DATA; pgoff_t fofs; + blkcnt_t count = 1; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; alloc: @@ -490,72 +611,43 @@ alloc: set_data_blkaddr(dn); /* update i_size */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) + f2fs_i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_SHIFT)); return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, - size_t count) +ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn; - u64 start = F2FS_BYTES_TO_BLK(offset); - u64 len = F2FS_BYTES_TO_BLK(count); - bool allocated; - u64 
end_offset; + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_map_blocks map; + ssize_t ret = 0; - while (len) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; - /* When reading holes, we need its node page */ - set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) - goto out; + map.m_next_pgofs = NULL; - allocated = false; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - - while (dn.ofs_in_node < end_offset && len) { - block_t blkaddr; - - if (unlikely(f2fs_cp_error(sbi))) - goto sync_out; - - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) - goto sync_out; - allocated = true; - } - len--; - start++; - dn.ofs_in_node++; - } - - if (allocated) - sync_inode_page(&dn); - - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } - return; - -sync_out: - if (allocated) - sync_inode_page(&dn); - f2fs_put_dnode(&dn); -out: - f2fs_unlock_op(sbi); - return; + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + if (!f2fs_has_inline_data(inode)) + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + return ret; } /* @@ -567,156 +659,181 @@ out: * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; - pgoff_t pgofs, end_offset; + int mode = create ? 
ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; struct extent_info ei; bool allocated = false; + block_t blkaddr; + + if (!maxblocks) + return 0; map->m_len = 0; map->m_flags = 0; /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; - if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; goto out; } +next_dnode: if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - if (err == -ENOENT) + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; + if (err == -ENOENT) { err = 0; + if (map->m_next_pgofs) + *map->m_next_pgofs = + get_next_page_offset(&dn, pgofs); + } goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + prealloc = 0; + ofs_in_node = dn.ofs_in_node; + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + +next_block: + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; - goto put_out; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + } else { + err = __allocate_data_block(&dn); + if (!err) { + set_inode_flag(inode, FI_APPEND_WRITE); + allocated = true; + } } - err = __allocate_data_block(&dn); if (err) - goto put_out; - allocated = true; + goto sync_out; map->m_flags = F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; } else { - if (flag != F2FS_GET_BLOCK_FIEMAP || - dn.data_blkaddr != NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; } - - /* - * preallocated unwritten block should be mapped - * for fiemap. - */ - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags = F2FS_MAP_UNWRITTEN; + if (flag == F2FS_GET_BLOCK_FIEMAP && + blkaddr == NULL_ADDR) { + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + } + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } } - map->m_flags |= F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - map->m_len = 1; + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + if (map->m_len == 0) { + /* preallocated unwritten block should be mapped for fiemap. 
*/ + if (blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_MAPPED; + + map->m_pblk = blkaddr; + map->m_len = 1; + } else if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || + flag == F2FS_GET_BLOCK_PRE_DIO) { + ofs++; + map->m_len++; + } else { + goto sync_out; + } + +skip: dn.ofs_in_node++; pgofs++; -get_next: - if (dn.ofs_in_node >= end_offset) { - if (allocated) - sync_inode_page(&dn); - allocated = false; - f2fs_put_dnode(&dn); + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, mode); - if (err) { - if (err == -ENOENT) - err = 0; - goto unlock_out; + dn.ofs_in_node = ofs_in_node; + err = reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; + allocated = dn.node_changed; + + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; } - - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + dn.ofs_in_node = end_offset; } - if (maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - /* - * we only merge preallocated unwritten blocks - * for fiemap. - */ - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) - goto sync_out; - } - } + f2fs_put_dnode(&dn); - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; - } + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); } + allocated = false; + goto next_dnode; + sync_out: - if (allocated) - sync_inode_page(&dn); -put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); + } out: trace_f2fs_map_blocks(inode, map, err); return err; } static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag) + struct buffer_head *bh, int create, int flag, + pgoff_t *next_pgofs) { struct f2fs_map_blocks map; int ret; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; + map.m_next_pgofs = next_pgofs; ret = f2fs_map_blocks(inode, &map, create, flag); if (!ret) { @@ -728,23 +845,29 @@ static int __get_data_block(struct inode *inode, sector_t iblock, } static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int flag) + struct buffer_head *bh_result, int create, int flag, + pgoff_t *next_pgofs) { - return __get_data_block(inode, iblock, bh_result, create, flag); + return __get_data_block(inode, iblock, bh_result, create, + flag, next_pgofs); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - 
F2FS_GET_BLOCK_DIO); + F2FS_GET_BLOCK_DIO, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP); + F2FS_GET_BLOCK_BMAP, NULL); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -762,10 +885,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + pgoff_t next_pgofs; + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); @@ -778,82 +901,64 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_data_block(inode, start_blk, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP); + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); if (ret) goto out; /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof && blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } - - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } - - /* - * if size != 0 then we know we already have an extent - * to add, so add it. + start_blk = next_pgofs; + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. 
*/ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } - - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; - - start_blk += logical_to_blk(inode, size); - - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; + flags |= FIEMAP_EXTENT_LAST; } + + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } + + if (start_blk > last_blk || ret) + goto out; + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; + + start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -863,10 +968,41 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } +static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio; + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + + return bio; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
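
Editor's aside (illustrative, not part of the patch): the new f2fs_grab_bio() above centralizes read-bio construction — for encrypted regular files it allocates the fscrypt context and parks it in bio->bi_private, where the reworked f2fs_read_end_io() either releases it on error or queues decryption on success. A minimal caller-side sketch of that pattern, condensed from the single-page read path this patch later adds to f2fs_write_begin(); the function name here is hypothetical:

	/*
	 * Sketch only (not part of the patch): one-page synchronous read
	 * via f2fs_grab_bio(). Name f2fs_read_one_page() is hypothetical.
	 */
	static int f2fs_read_one_page(struct inode *inode, struct page *page,
					block_t blkaddr)
	{
		struct bio *bio;

		/* allocates the bio and, for encrypted regular files, the
		 * fscrypt context that ends up in bio->bi_private */
		bio = f2fs_grab_bio(inode, blkaddr, 1);
		if (IS_ERR(bio))
			return PTR_ERR(bio);

		if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
			bio_put(bio);
			return -EFAULT;
		}

		/* f2fs_read_end_io() releases the fscrypt context on error,
		 * or queues fscrypt_decrypt_bio_pages() on success */
		__submit_bio(F2FS_I_SB(inode), READ_SYNC, bio, DATA);
		return 0;
	}
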
@@ -885,13 +1021,13 @@ static int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; - struct block_device *bdev = inode->i_sb->s_bdev; struct f2fs_map_blocks map; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; + map.m_next_pgofs = NULL; for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { @@ -930,7 +1066,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_READ)) goto set_error_page; } got_it: @@ -943,8 +1079,9 @@ got_it: goto confused; } } else { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); goto next_page; } @@ -955,35 +1092,15 @@ got_it: */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } if (bio == NULL) { - struct f2fs_crypto_ctx *ctx = NULL; - - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - goto set_error_page; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback( - F2FS_I_SB(inode), block_nr); - } - - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - f2fs_release_crypto_ctx(ctx); + bio = f2fs_grab_bio(inode, block_nr, nr_pages); + if (IS_ERR(bio)) { + bio = NULL; goto set_error_page; } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -993,22 +1110,22 @@ submit_and_realloc: goto next_page; set_error_page: SetPageError(page); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); goto next_page; confused: if (bio) { - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } unlock_page(page); next_page: if (pages) - page_cache_release(page); + put_page(page); } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); return 0; } @@ -1055,23 +1172,33 @@ int do_write_data_page(struct f2fs_io_info *fio) if (err) return err; - fio->blk_addr = dn.data_blkaddr; + fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (fio->blk_addr == NULL_ADDR) { + if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); goto out_writepage; } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + gfp_t gfp_flags = GFP_NOFS; /* wait for GCed encrypted page writeback */ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->blk_addr); - - fio->encrypted_page = f2fs_encrypt(inode, fio->page); + fio->old_blkaddr); +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); + if (err == -ENOMEM) { + /* flush pending ios and wait for a while */ + f2fs_flush_merged_bios(F2FS_I_SB(inode)); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + err = 0; + goto retry_encrypt; + } goto out_writepage; } } @@ -1082,20 +1209,19 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for 
updated data. */ - if (unlikely(fio->blk_addr != NEW_ADDR && + if (unlikely(fio->old_blkaddr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); - set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -1109,7 +1235,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) - >> PAGE_CACHE_SHIFT; + >> PAGE_SHIFT; + loff_t psize = (page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -1130,37 +1257,37 @@ static int f2fs_write_data_page(struct page *page, * If the offset is out-of-range of file size, * this page does not have to be written to disk. */ - offset = i_size & (PAGE_CACHE_SIZE - 1); + offset = i_size & (PAGE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) goto out; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); write: if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; - if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)) + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; err = do_write_data_page(&fio); goto done; } - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - SetPageError(page); - goto out; - } - if (!wbc->for_reclaim) need_balance_fs = true; - else if (has_not_enough_free_secs(sbi, 0)) + else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; err = -EAGAIN; @@ -1169,6 +1296,8 @@ write: err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) err = do_write_data_page(&fio); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -1179,25 +1308,24 @@ out: inode_dec_dirty_pages(inode); if (err) ClearPageUptodate(page); + + if (wbc->for_reclaim) { + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + remove_dirty_inode(inode); + } + unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + f2fs_balance_fs(sbi, need_balance_fs); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, DATA, WRITE); + return 0; redirty_out: redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - -static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = 
mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + unlock_page(page); + return err; } /* @@ -1206,8 +1334,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) + struct writeback_control *wbc) { int ret = 0; int done = 0; @@ -1220,10 +1347,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; + int nwritten = 0; pagevec_init(&pvec, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1233,8 +1360,8 @@ next: cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -1278,12 +1405,10 @@ continue_unlock: goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, + DATA, true); else goto continue_unlock; } @@ -1292,16 +1417,13 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = (*writepage)(page, wbc, data); + ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done_index = page->index + 1; - done = 1; - break; - } + done_index = page->index + 1; + done = 1; + break; + } else { + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -1314,11 +1436,6 @@ continue_unlock: cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; @@ -1328,6 +1445,10 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; + if (nwritten) + f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA, WRITE); + return ret; } @@ -1336,11 +1457,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool locked = false; + struct blk_plug plug; int ret; - long diff; - - trace_f2fs_writepages(mapping->host, wbc, DATA); /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1355,41 +1473,119 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. 
*/ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; - diff = nr_pages_to_write(sbi, DATA, wbc); + trace_f2fs_writepages(mapping->host, wbc, DATA); - if (!S_ISDIR(inode->i_mode)) { - mutex_lock(&sbi->writepages); - locked = true; - } - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - if (locked) - mutex_unlock(&sbi->writepages); + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc); + blk_finish_plug(&plug); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. + */ - remove_dirty_dir_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + remove_dirty_inode(inode); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); + trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); } } +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; + + /* + * we already allocated all the blocks, so we don't need to get + * the block addresses when there is no need to fill the page. + */ + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + return 0; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; + } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + set_inline_node(ipage); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1397,9 +1593,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; - pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + 
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; if (trace_android_fs_datawrite_start_enabled()) { @@ -1414,8 +1610,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, } trace_f2fs_write_begin(inode, pos, len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1435,98 +1629,63 @@ repeat: *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); - - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; - } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; - } - - err = f2fs_get_block(&dn, index); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + goto fail; - f2fs_wait_on_page_writeback(page, DATA); + if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; + } + } + + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - if (len == PAGE_CACHE_SIZE) - goto out_update; - if (PageUptodate(page)) - goto out_clear; + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out_update; - } - - if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + if (blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); } else { - struct f2fs_io_info fio = { - .sbi = sbi, - .type = DATA, - .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, - .page = page, - .encrypted_page = NULL, - }; - err = f2fs_submit_page_bio(&fio); - if (err) - goto fail; + struct bio *bio; - lock_page(page); - if (unlikely(!PageUptodate(page))) { - err = -EIO; + bio = f2fs_grab_bio(inode, blkaddr, 1); + if (IS_ERR(bio)) { + err = PTR_ERR(bio); goto fail; } + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + err = -EFAULT; + goto fail; + } + + __submit_bio(sbi, READ_SYNC, bio, DATA); + + lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } - - /* avoid symlink page */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - err = f2fs_decrypt_one(inode, page); - if (err) - goto fail; + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; } } -out_update: - SetPageUptodate(page); -out_clear: - clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); 
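
Editor's aside (illustrative, not part of the patch): in the rewritten f2fs_write_begin() above, f2fs_balance_fs() may kick garbage collection and sleep, so it is no longer called with the target page locked. The hunk drops the page lock, balances, relocks, and re-validates page->mapping, since truncate may have removed the page in between. The same idiom pulled out as a standalone sketch; the helper name is hypothetical:

	/*
	 * Sketch only: the unlock/balance/relock idiom used by
	 * f2fs_write_begin() above. Returns true when the page was
	 * truncated while unlocked and the caller must re-grab it
	 * (the "goto repeat" path in the real code).
	 */
	static bool f2fs_balance_under_page_lock(struct f2fs_sb_info *sbi,
						struct address_space *mapping,
						struct page *page)
	{
		unlock_page(page);
		f2fs_balance_fs(sbi, true);	/* may run GC and sleep */
		lock_page(page);

		/* truncate can detach the page while it was unlocked */
		if (page->mapping != mapping) {
			f2fs_put_page(page, 1);
			return true;
		}
		return false;
	}
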
@@ -1543,15 +1702,28 @@ static int f2fs_write_end(struct file *file,
 	trace_android_fs_datawrite_end(inode, pos, len);
 	trace_f2fs_write_end(inode, pos, len, copied);
 
-	set_page_dirty(page);
-
-	if (pos + copied > i_size_read(inode)) {
-		i_size_write(inode, pos + copied);
-		mark_inode_dirty(inode);
-		update_inode_page(inode);
+	/*
+	 * This should be come from len == PAGE_SIZE, and we expect copied
+	 * should be PAGE_SIZE. Otherwise, we treat it with zero copied and
+	 * let generic_perform_write() try to copy data again through copied=0.
+	 */
+	if (!PageUptodate(page)) {
+		if (unlikely(copied != PAGE_SIZE))
+			copied = 0;
+		else
+			SetPageUptodate(page);
 	}
+	if (!copied)
+		goto unlock_out;
 
+	set_page_dirty(page);
+	clear_cold_data(page);
+
+	if (pos + copied > i_size_read(inode))
+		f2fs_i_size_write(inode, pos + copied);
+unlock_out:
 	f2fs_put_page(page, 1);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return copied;
 }
 
@@ -1570,29 +1742,22 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
 }
 
 static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-							loff_t offset)
+				loff_t offset)
 {
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	int rw = iov_iter_rw(iter);
 	int err;
 
-	/* we don't need to use inline_data strictly */
-	if (f2fs_has_inline_data(inode)) {
-		err = f2fs_convert_inline_inode(inode);
-		if (err)
-			return err;
-	}
-
-	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
-		return 0;
-
 	err = check_direct_IO(inode, iter, offset);
 	if (err)
 		return err;
 
-	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+		return 0;
+	if (test_opt(F2FS_I_SB(inode), LFS))
+		return 0;
 
 	if (trace_android_fs_dataread_start_enabled() &&
 	    (iov_iter_rw(iter) == READ)) {
@@ -1616,18 +1781,18 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 					current->pid, path, current->comm);
 	}
 
-	if (iov_iter_rw(iter) == WRITE) {
-		__allocate_data_blocks(inode, offset, count);
-		if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
-			err = -EIO;
-			goto out;
-		}
-	}
+	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
+	down_read(&F2FS_I(inode)->dio_rwsem[rw]);
 	err = blockdev_direct_IO(iocb, inode, iter, offset,
 					get_data_block_dio);
-out:
-	if (err < 0 && iov_iter_rw(iter) == WRITE)
-		f2fs_write_failed(mapping, offset + count);
+	up_read(&F2FS_I(inode)->dio_rwsem[rw]);
+
+	if (rw == WRITE) {
+		if (err > 0)
+			set_inode_flag(inode, FI_UPDATE_WRITE);
+		else if (err < 0)
+			f2fs_write_failed(mapping, offset + count);
+	}
 
 	if (trace_android_fs_dataread_start_enabled() &&
 	    (iov_iter_rw(iter) == READ))
@@ -1636,7 +1801,7 @@ out:
 		(iov_iter_rw(iter) == WRITE))
 		trace_android_fs_datawrite_end(inode, offset, count);
 
-	trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err);
+	trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
 
 	return err;
 }
@@ -1648,7 +1813,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
-		(offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
+		(offset % PAGE_SIZE || length != PAGE_SIZE))
 		return;
 
 	if (PageDirty(page)) {
@@ -1664,6 +1829,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 	if (IS_ATOMIC_WRITTEN_PAGE(page))
 		return;
 
+	set_page_private(page, 0);
ClearPagePrivate(page); } @@ -1677,10 +1843,42 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + set_page_private(page, 0); ClearPagePrivate(page); return 1; } +/* + * This was copied from __set_page_dirty_buffers which gives higher performance + * in very high speed storages. (e.g., pmem) + */ +void f2fs_set_page_dirty_nobuffers(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + + if (unlikely(!mapping)) + return; + + spin_lock(&mapping->private_lock); + memcg = mem_cgroup_begin_page_stat(page); + SetPageDirty(page); + spin_unlock(&mapping->private_lock); + + spin_lock_irqsave(&mapping->tree_lock, flags); + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping, memcg); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + mem_cgroup_end_page_stat(memcg); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return; +} + static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -1688,7 +1886,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { @@ -1703,7 +1902,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } @@ -1724,6 +1923,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block_bmap); } +#ifdef CONFIG_MIGRATION +#include + +int f2fs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc, extra_count; + struct f2fs_inode_info *fi = F2FS_I(mapping->host); + bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + + BUG_ON(PageWriteback(page)); + + /* migrating an atomic written page is safe with the inmem_lock hold */ + if (atomic_written && !mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + + /* + * A reference is expected if PagePrivate set when move mapping, + * however F2FS breaks this for maintaining dirty page counts when + * truncating pages. So here adjusting the 'extra_count' make it work. + */ + extra_count = (atomic_written ? 
1 : 0) - page_has_private(page); + rc = migrate_page_move_mapping(mapping, newpage, + page, NULL, mode, extra_count); + if (rc != MIGRATEPAGE_SUCCESS) { + if (atomic_written) + mutex_unlock(&fi->inmem_lock); + return rc; + } + + if (atomic_written) { + struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) + if (cur->page == page) { + cur->page = newpage; + break; + } + mutex_unlock(&fi->inmem_lock); + put_page(page); + get_page(newpage); + } + + if (PagePrivate(page)) + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + + migrate_page_copy(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} +#endif + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -1736,4 +1987,7 @@ const struct address_space_operations f2fs_dblock_aops = { .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 24d6a51b48d1..1c35e80732e0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -38,23 +38,30 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -105,7 +112,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -140,6 +147,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); + si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ 
-148,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + if (f2fs_discard_en(sbi)) + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -161,7 +171,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; - si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); @@ -189,18 +199,18 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i <= ORPHAN_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -211,20 +221,24 @@ static int stat_show(struct seq_file *s, void *v) mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - char devname[BDEVNAME_SIZE]; - update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", - bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? 
"RO": "RW"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); - seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", - si->utilization, si->valid_count); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", si->valid_node_count, si->valid_inode_count); seq_printf(s, "Other: %u)\n - Data: %u\n", @@ -236,6 +250,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Orphan Inode: %u\n", + si->orphans); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -269,7 +285,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -290,17 +307,21 @@ static int stat_show(struct seq_file *s, void *v) !si->total_ext ? 0 : div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb: %4d\n", - si->inmem_pages, si->wb_pages); - seq_printf(s, " - nodes: %4d in %4d\n", + seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + si->inmem_pages, si->wb_bios); + seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4d in dirs:%4d\n", - si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta: %4d in %4d\n", + seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - datas: %4lld in files:%4d\n", + si->ndirty_data, si->ndirty_files); + seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4lld\n", + si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d\n", @@ -407,20 +428,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 60972a559685..e634a637c443 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -17,8 +17,8 @@ static unsigned long dir_blocks(struct inode *inode) { - return 
((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1)) - >> PAGE_CACHE_SHIFT; + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; } static unsigned int dir_buckets(unsigned int level, int dir_level) @@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_UNKNOWN] = DT_UNKNOWN, [F2FS_FT_REG_FILE] = DT_REG, [F2FS_FT_DIR] = DT_DIR, @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -77,7 +83,7 @@ static unsigned long dir_block_index(unsigned int level, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - struct f2fs_filename *fname, + struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct page **res_page) @@ -95,23 +101,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, else kunmap(dentry_page); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); return de; } -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); - struct f2fs_str *name = &fname->disk_name; + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -124,37 +125,28 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, de = &d->dentry[bit_pos]; - if (de->hash_code != namehash) - goto not_match; + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (unlikely(!name->name)) { - if (fname->usr_fname->name[0] == '_') { - if (de_name.len > 32 && - !memcmp(de_name.name + ((de_name.len - 17) & ~15), - fname->crypto_buf.name + 8, 16)) - goto found; - goto not_match; - } - name->name = fname->crypto_buf.name; - name->len = fname->crypto_buf.len; - } -#endif - if (de_name.len == name->len && - !memcmp(de_name.name, name->name, name->len)) + /* show encrypted name */ + if (fname->hash) { + if (de->hash_code == fname->hash) + goto found; + } else if (de_name.len == name->len && + de->hash_code == namehash && + !memcmp(de_name.name, name->name, name->len)) goto found; -not_match: + if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; - /* remain bug on condition */ - if (unlikely(!de->name_len)) - d->max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -167,7 +159,7 @@ found: static struct f2fs_dir_entry 
*find_in_level(struct inode *dir, unsigned int level, - struct f2fs_filename *fname, + struct fscrypt_name *fname, struct page **res_page) { struct qstr name = FSTR_TO_QSTR(&fname->disk_name); @@ -180,9 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, int max_slots; f2fs_hash_t namehash; - namehash = f2fs_dentry_hash(&name, fname); - - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); + if(fname->hash) + namehash = cpu_to_le32(fname->hash); + else + namehash = f2fs_dentry_hash(&name); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -195,8 +188,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, &max_slots, @@ -217,6 +215,44 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, return de; } +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) +{ + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + unsigned int max_depth; + unsigned int level; + + if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; + de = find_in_inline_dir(dir, fname, res_page); + goto out; + } + + if (npages == 0) { + *res_page = NULL; + goto out; + } + + max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + f2fs_i_depth_write(dir, max_depth); + } + + for (level = 0; level < max_depth; level++) { + *res_page = NULL; + de = find_in_level(dir, level, fname, res_page); + if (de || IS_ERR(*res_page)) + break; + } +out: + return de; +} + /* * Find an entry in the specified directory with the wanted name. * It returns the page where the entry was found (as a parameter - res_page), @@ -224,72 +260,42 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, * Entry is guaranteed to be valid. 
*/ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - struct qstr *child, struct page **res_page) + const struct qstr *child, struct page **res_page) { - unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; - unsigned int max_depth; - unsigned int level; - struct f2fs_filename fname; + struct fscrypt_name fname; int err; - *res_page = NULL; - - err = f2fs_fname_setup_filename(dir, child, 1, &fname); - if (err) + err = fscrypt_setup_filename(dir, child, 1, &fname); + if (err) { + *res_page = ERR_PTR(err); return NULL; - - if (f2fs_has_inline_dentry(dir)) { - de = find_in_inline_dir(dir, &fname, res_page); - goto out; } - if (npages == 0) - goto out; + de = __f2fs_find_entry(dir, &fname, res_page); - max_depth = F2FS_I(dir)->i_current_depth; - - for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, &fname, res_page); - if (de) - break; - } -out: - f2fs_fname_free_filename(&fname); + fscrypt_free_filename(&fname); return de; } struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page; - struct f2fs_dir_entry *de; - struct f2fs_dentry_block *dentry_blk; + struct qstr dotdot = QSTR_INIT("..", 2); - if (f2fs_has_inline_dentry(dir)) - return f2fs_parent_inline_dir(dir, p); - - page = get_lock_data_page(dir, 0, false); - if (IS_ERR(page)) - return NULL; - - dentry_blk = kmap(page); - de = &dentry_blk->dentry[1]; - *p = page; - unlock_page(page); - return de; + return f2fs_find_entry(dir, &dotdot, p); } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page) { ino_t res = 0; struct f2fs_dir_entry *de; - struct page *page; - de = f2fs_find_entry(dir, qstr, &page); + de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + f2fs_dentry_kunmap(dir, *page); + f2fs_put_page(*page, 0); } return res; @@ -300,14 +306,14 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; lock_page(page); - f2fs_wait_on_page_writeback(page, type); + f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -315,7 +321,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); @@ -345,24 +351,14 @@ int update_dent_inode(struct inode *inode, struct inode *to, void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { - struct f2fs_dir_entry *de; + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); - de = &d->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = 0; - de->ino = cpu_to_le32(inode->i_ino); - memcpy(d->filename[0], ".", 1); - set_de_type(de, inode->i_mode); + /* update dirent of "." 
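
/*
 * Note the error-path change above: f2fs_find_entry() now reports the
 * errno through *res_page as an ERR_PTR instead of silently returning
 * NULL. A self-contained model of that pointer encoding (the kernel's
 * real macros live in <linux/err.h>; lookup() here is hypothetical):
 */

#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* small negative errnos land in the top page of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *lookup(int fail)
{
	static int obj = 42;

	return fail ? ERR_PTR(-2 /* -ENOENT */) : &obj;
}

int main(void)
{
	void *p = lookup(1);

	if (IS_ERR(p))
		printf("lookup failed: %ld\n", PTR_ERR(p));
	return 0;
}
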
*/ + f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); - de = &d->dentry[1]; - de->hash_code = 0; - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(d->filename[1], "..", 2); - set_de_type(de, parent->i_mode); - - test_and_set_bit_le(0, (void *)d->bitmap); - test_and_set_bit_le(1, (void *)d->bitmap); + /* update dirent of ".." */ + f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); } static int make_empty_dir(struct inode *inode, @@ -392,32 +388,38 @@ static int make_empty_dir(struct inode *inode, } struct page *init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *name, struct page *dpage) + const struct qstr *new_name, const struct qstr *orig_name, + struct page *dpage) { struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) { page = new_inode_page(inode); if (IS_ERR(page)) return page; if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); err = make_empty_dir(inode, dir, page); - if (err) - goto error; + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); if (err) goto put_error; - err = f2fs_init_security(inode, dir, name, page); + err = f2fs_init_security(inode, dir, orig_name, page); if (err) goto put_error; if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { - err = f2fs_inherit_context(dir, inode, page); + err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; } @@ -429,14 +431,14 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (name) - init_dent_inode(name, page); + if (new_name) + init_dent_inode(new_name, page); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. 
*/ - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + if (is_inode_flag_set(inode, FI_INC_LINK)) { file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, @@ -444,41 +446,33 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, */ if (inode->i_nlink == 0) remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); - inc_nlink(inode); + f2fs_i_links_write(inode, true); } return page; put_error: + clear_nlink(inode); + update_inode(inode, page); f2fs_put_page(page, 1); -error: - /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ - truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); - remove_inode_page(inode); return ERR_PTR(err); } void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - if (S_ISDIR(inode->i_mode)) { - inc_nlink(dir); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + f2fs_mark_inode_dirty_sync(dir); - if (F2FS_I(dir)->i_current_depth != current_depth) { - F2FS_I(dir)->i_current_depth = current_depth; - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); - if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); } int room_for_filename(const void *bitmap, int slots, int max_slots) @@ -515,15 +509,16 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, memcpy(d->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(ino); set_de_type(de, mode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); + for (i = 0; i < slots; i++) { + __set_bit_le(bit_pos + i, (void *)d->bitmap); + /* avoid wrong garbage data for readdir */ + if (i) + (de + i)->name_len = 0; + } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op(). 
- */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -536,28 +531,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct f2fs_filename fname; - struct qstr new_name; - int slots, err; - - err = f2fs_fname_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name, NULL); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -566,10 +544,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + return -ENOSPC; +#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -583,10 +563,8 @@ start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -602,11 +580,12 @@ start: ++level; goto start; add_dentry: - f2fs_wait_on_page_writeback(dentry_page, DATA); + f2fs_wait_on_page_writeback(dentry_page, DATA, true); if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, + orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -616,14 +595,12 @@ add_dentry: } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); if (inode) { - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -632,14 +609,49 @@ fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode_page(dir); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: - f2fs_fname_free_filename(&fname); + + return err; +} + +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct qstr new_name; + int err = -EAGAIN; + + new_name.name = fname_name(fname); + new_name.len = fname_len(fname); + + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, + 
inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + + fscrypt_free_filename(&fname); return err; } @@ -649,46 +661,39 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL); + page = init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } - /* we don't need to mark_inode_dirty now */ - update_inode(inode, page); f2fs_put_page(page, 1); - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + clear_inode_flag(inode, FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } -void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - if (page) - update_inode(dir, page); - else - update_inode_page(dir); - } + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); + f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); } up_write(&F2FS_I(inode)->i_sem); - update_inode_page(inode); if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); + add_orphan_inode(inode); else release_orphan_inode(sbi); } @@ -705,11 +710,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); lock_page(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; @@ -724,9 +731,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, NULL); + f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { @@ -777,12 +785,12 @@ bool f2fs_empty_dir(struct inode *dir) } bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, - unsigned int start_pos, struct f2fs_str *fstr) + unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -792,10 +800,13 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, break; de = &d->dentry[bit_pos]; - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + if (de->name_len == 0) { 
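
/*
 * __f2fs_do_add_link() above tries the inline directory area first and
 * falls back to a regular dentry block when it reports -EAGAIN. The
 * control flow in isolation; add_inline() and add_regular() are
 * hypothetical stand-ins for the two f2fs paths:
 */

#include <stdio.h>

#define EAGAIN 11

static int add_inline(int room) { return room ? 0 : -EAGAIN; }
static int add_regular(void) { return 0; }

static int do_add_link(int has_inline, int inline_room)
{
	int err = -EAGAIN;

	if (has_inline)
		err = add_inline(inline_room);	/* "no space here" => -EAGAIN */
	if (err == -EAGAIN)
		err = add_regular();		/* fall back to block form */
	return err;
}

int main(void)
{
	printf("%d %d\n", do_add_link(1, 1), do_add_link(1, 0));
	return 0;
}
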
+ bit_pos++; + ctx->pos = start_pos + bit_pos; + continue; + } + + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -804,15 +815,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; - de_name.name = kmalloc(de_name.len, GFP_NOFS); - if (!de_name.name) - return false; - - memcpy(de_name.name, d->filename[bit_pos], de_name.len); - - ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, - &de_name, fstr); - kfree(de_name.name); + ret = fscrypt_fname_disk_to_usr(d->inode, + (u32)de->hash_code, 0, + &de_name, fstr); if (ret < 0) return true; @@ -839,16 +844,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; - struct f2fs_str fstr = FSTR_INIT(NULL, 0); + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; if (f2fs_encrypted_inode(inode)) { - err = f2fs_get_encryption_info(inode); - if (err) + err = fscrypt_get_encryption_info(inode); + if (err && err != -ENOKEY) return err; - err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, - &fstr); + err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) return err; } @@ -865,36 +869,47 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); - if (IS_ERR(dentry_page)) - continue; + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } + err = 0; out: - f2fs_fname_crypto_free_buffer(&fstr); + fscrypt_fname_free_buffer(&fstr); return err; } +static int f2fs_dir_open(struct inode *inode, struct file *filp) +{ + if (f2fs_encrypted_inode(inode)) + return fscrypt_get_encryption_info(inode) ? 
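
/*
 * The comment in __release_extent_node() above fixes a strict order:
 * unlink from the global LRU under the spinlock, then detach from the
 * per-inode tree, then free. A userspace model of why the LRU unlink
 * must come first (pthreads; the list here is a bare self-linked node):
 */

#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *prev, *next;	/* global LRU linkage */
	int in_tree;			/* stands in for the rb-tree */
};

static pthread_spinlock_t lru_lock;

static void list_del_init(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;
}

/* Freeing before the unlink would let a concurrent LRU scan, which only
 * holds lru_lock, walk into freed memory. */
static void release_node(struct node *n)
{
	pthread_spin_lock(&lru_lock);
	list_del_init(n);
	pthread_spin_unlock(&lru_lock);

	n->in_tree = 0;			/* detach from the owner structure */
	free(n);
}

int main(void)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return 1;
	pthread_spin_init(&lru_lock, PTHREAD_PROCESS_PRIVATE);
	n->prev = n->next = n;
	n->in_tree = 1;
	release_node(n);
	return 0;
}
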
-EACCES : 0; + return 0; +} + const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate = f2fs_readdir, .fsync = f2fs_sync_file, + .open = f2fs_dir_open, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7ddba812e11b..2b06d4fcd954 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -33,10 +33,11 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, en->ei = *ei; INIT_LIST_HEAD(&en->list); + en->et = et; rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,11 +46,29 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; + kmem_cache_free(extent_node_slab, en); +} + +/* + * Flow to release an extent_node: + * 1. list_del_init + * 2. __detach_extent_node + * 3. kmem_cache_free. + */ +static void __release_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + spin_lock(&sbi->extent_lock); + f2fs_bug_on(sbi, list_empty(&en->list)); + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); } static struct extent_tree *__grab_extent_tree(struct inode *inode) @@ -68,11 +87,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -127,32 +148,21 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, bool free_all) + struct extent_tree *et) { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); - - if (free_all) { - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); - } - - if (free_all || list_empty(&en->list)) { - __detach_extent_node(sbi, et, en); - kmem_cache_free(extent_node_slab, en); - } + __release_extent_node(sbi, et, en); node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } static void __drop_largest_extent(struct inode *inode, @@ -160,38 +170,38 @@ static void __drop_largest_extent(struct inode *inode, { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; + f2fs_mark_inode_dirty_sync(inode); + } } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs, 1); -} 
- -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + get_extent_info(&ei, i_ext); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -202,6 +212,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -230,9 +241,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, if (en) { *ei = en->ei; spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) + if (!list_empty(&en->list)) { list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; + et->cached_en = en; + } spin_unlock(&sbi->extent_lock); ret = true; } @@ -325,12 +337,12 @@ lookup_neighbors: return en; } -static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, +static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, - struct extent_node **den, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -340,28 +352,34 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) { - __detach_extent_node(sbi, et, prev_ex); - *den = prev_ex; - } + if (en) + __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; en = next_ex; } - if (en) { - __try_update_largest_extent(et, en); + if (!en) + return NULL; + + __try_update_largest_extent(inode, et, en); + + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); et->cached_en = en; } + spin_unlock(&sbi->extent_lock); return en; } -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, +static struct extent_node *__insert_extent_tree(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, struct rb_node *insert_parent) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p = &et->root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -388,8 +406,13 @@ do_insert: if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); + + /* update in global extent list */ + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); et->cached_en = en; + spin_unlock(&sbi->extent_lock); return en; } @@ -412,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, write_lock(&et->lock); - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + 
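
/*
 * __try_merge_extent_node() above grows an adjacent extent rather than
 * inserting a new node when the new range lines up with a neighbour on
 * both the file-offset and block axes. The interval test standalone
 * (fofs/blk/len mirror struct extent_info; the rb-tree is omitted):
 */

#include <stdio.h>

struct extent { unsigned int fofs, blk, len; };

/* Back-mergeable: @cur starts exactly where @prev ends, on disk too. */
static int back_mergeable(const struct extent *cur, const struct extent *prev)
{
	return prev->fofs + prev->len == cur->fofs &&
	       prev->blk + prev->len == cur->blk;
}

int main(void)
{
	struct extent prev = { .fofs = 0, .blk = 100, .len = 4 };
	struct extent cur  = { .fofs = 4, .blk = 104, .len = 2 };

	if (back_mergeable(&cur, &prev)) {
		prev.len += cur.len;	/* absorb instead of inserting */
		printf("merged: fofs %u len %u blk %u\n",
		       prev.fofs, prev.len, prev.blk);
	}
	return 0;
}
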
if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); return false; } @@ -454,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(sbi, et, &ei, + en1 = __insert_extent_tree(inode, et, &ei, NULL, NULL); next_en = en1; } else { @@ -475,9 +498,9 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); else - __detach_extent_node(sbi, et, en); + __release_extent_node(sbi, et, en); /* * if original extent is split into zero or two parts, extent @@ -488,58 +511,28 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, insert_p = NULL; insert_parent = NULL; } - - /* update in global extent list */ - spin_lock(&sbi->extent_lock); - if (!parts && !list_empty(&en->list)) - list_del(&en->list); - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - - /* release extent node */ - if (!parts) - kmem_cache_free(extent_node_slab, en); - en = next_en; } /* 3. update extent in extent cache */ if (blkaddr) { - struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); - en1 = __try_merge_extent_node(sbi, et, &ei, &den, - prev_en, next_en); - if (!en1) - en1 = __insert_extent_tree(sbi, et, &ei, + if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) + __insert_extent_tree(inode, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - et->largest.len = 0; - set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + __drop_largest_extent(inode, 0, UINT_MAX); + set_inode_flag(inode, FI_NO_EXTENT); } - - spin_lock(&sbi->extent_lock); - if (en1) { - if (list_empty(&en1->list)) - list_add_tail(&en1->list, &sbi->extent_list); - else - list_move_tail(&en1->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); - - if (den) - kmem_cache_free(extent_node_slab, den); } - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) - __free_extent_tree(sbi, et, true); + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __free_extent_tree(sbi, et); write_unlock(&et->lock); @@ -548,46 +541,42 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { - struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; - struct extent_node *en, *tmp; - unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; - unsigned int found; + struct extent_tree *et, *next; + struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. 
remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et); + write_unlock(&et->lock); } + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + cond_resched(); } up_write(&sbi->extent_tree_lock); +free_node: /* 2. remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -595,34 +584,29 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) remained = nr_shrink - (node_cnt + tree_cnt); spin_lock(&sbi->extent_lock); - list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!remained--) + for (; remained > 0; remained--) { + if (list_empty(&sbi->extent_list)) break; + en = list_first_entry(&sbi->extent_list, + struct extent_node, list); + et = en->et; + if (!write_trylock(&et->lock)) { + /* refresh this extent node's position in extent list */ + list_move_tail(&en->list, &sbi->extent_list); + continue; + } + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); + + write_unlock(&et->lock); + node_cnt++; + spin_lock(&sbi->extent_lock); } spin_unlock(&sbi->extent_lock); - /* - * reset ino for searching victims from beginning of global extent tree. 
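
/*
 * The LRU pass above only *try*-acquires each victim's tree lock while
 * the global spinlock is held; on contention the node is rotated to the
 * list tail and skipped, which sidesteps a lock-order inversion against
 * writers that take the tree lock first. A compact pthreads model of
 * that trylock-or-rotate step (the data structures are elided):
 */

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Returns 1 if the victim was evicted, 0 if it was rotated and skipped. */
static int shrink_one(void)
{
	int evicted = 0;

	pthread_mutex_lock(&list_lock);
	if (pthread_rwlock_trywrlock(&tree_lock) == 0) {
		/* unlink from the list, then drop the list lock early */
		pthread_mutex_unlock(&list_lock);
		/* ... detach the node from its tree here ... */
		pthread_rwlock_unlock(&tree_lock);
		evicted = 1;
	} else {
		/* contended: rotate the victim to the tail, try the next */
		pthread_mutex_unlock(&list_lock);
	}
	return evicted;
}

int main(void)
{
	printf("evicted: %d\n", shrink_one());
	return 0;
}
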
- */ - ino = F2FS_ROOT_INO(sbi); - - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } unlock_out: up_write(&sbi->extent_tree_lock); out: @@ -637,16 +621,29 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et, true); + node_cnt = __free_extent_tree(sbi, et); write_unlock(&et->lock); return node_cnt; } +void f2fs_drop_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + + set_inode_flag(inode, FI_NO_EXTENT); + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + __drop_largest_extent(inode, 0, UINT_MAX); + write_unlock(&et->lock); +} + void f2fs_destroy_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -656,8 +653,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -666,11 +667,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -689,20 +689,20 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, void f2fs_update_extent_cache(struct dnode_of_data *dn) { - struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs; + block_t blkaddr; if (!f2fs_may_extent_tree(dn->inode)) return; - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + if (dn->data_blkaddr == NEW_ADDR) + blkaddr = NULL_ADDR; + else + blkaddr = dn->data_blkaddr; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) - sync_inode_page(dn); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } void f2fs_update_extent_cache_range(struct dnode_of_data *dn, @@ -712,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, if (!f2fs_may_extent_tree(dn->inode)) return; - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } void init_extent_cache_info(struct f2fs_sb_info *sbi) @@ -722,7 +721,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); 
spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2871576fbca4..af293e84e5cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,10 +21,12 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) -#define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else #define f2fs_bug_on(sbi, condition) \ do { \ @@ -33,7 +35,30 @@ set_sbi_flag(sbi, SBI_NEED_FSCK); \ } \ } while (0) -#define f2fs_down_write(x, y) down_write(x) +#endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_PAGE_ALLOC, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_IO, + FAULT_CHECKPOINT, + FAULT_MAX, +}; + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern char *fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) #endif /* @@ -54,6 +79,10 @@ #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_ADAPTIVE 0x00020000 +#define F2FS_MOUNT_LFS 0x00040000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -74,6 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_HMSMR 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -82,25 +112,30 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) -#define CRCPOLY_LE 0xedb88320 - -static inline __u32 f2fs_crc32(void *buf, size_t len) +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) { - unsigned char *p = (unsigned char *)buf; - __u32 crc = F2FS_SUPER_MAGIC; - int i; - - while (len--) { - crc ^= *p++; - for (i = 0; i < 8; i++) - crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); - } - return crc; + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. 
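
/*
 * The helper deleted above is the classic bit-serial CRC32 over the
 * reflected polynomial 0xedb88320, seeded with F2FS_SUPER_MAGIC; the
 * replacement computes the same checksum through the kernel crypto
 * API's crc32 shash. The removed algorithm, runnable standalone
 * (0xF2F52010 assumed to be F2FS_SUPER_MAGIC):
 */

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define CRCPOLY_LE	0xedb88320u
#define SUPER_MAGIC	0xF2F52010u	/* F2FS_SUPER_MAGIC */

static uint32_t f2fs_crc32(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint32_t crc = SUPER_MAGIC;
	int i;

	while (len--) {
		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
	}
	return crc;
}

int main(void)
{
	const char msg[] = "f2fs checkpoint";

	printf("crc = 0x%08x\n", f2fs_crc32(msg, sizeof(msg) - 1));
	return 0;
}
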
+ */ + smp_mb(); + return waitqueue_active(wq); } -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) +static inline void inode_nohighmem(struct inode *inode) { - return f2fs_crc32(buf, buf_size) == blk_crc; + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } /* @@ -119,12 +154,13 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 32 +#define DEF_BATCHED_TRIM_SECTIONS 2 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@ -158,13 +194,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. - */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -177,46 +207,52 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +struct bio_entry { + struct list_head list; + struct bio *bio; + struct completion event; + int error; +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ - block_t last_inode; /* block address locating the last inode */ }; -#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) -#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) -#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) -#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) -#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) -#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) -#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) -#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) +#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) -static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { - int before = nats_in_cursum(rs); - rs->n_nats = cpu_to_le16(before + i); + int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } -static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { - int before = sits_in_cursum(rs); - rs->n_sits = cpu_to_le16(before + i); + int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } -static inline bool __has_cursum_space(struct 
f2fs_summary_block *sum, int size, - int type) +static inline bool __has_cursum_space(struct f2fs_journal *journal, + int size, int type) { if (type == NAT_JOURNAL) - return size <= MAX_NAT_JENTRIES(sum); - return size <= MAX_SIT_JENTRIES(sum); + return size <= MAX_NAT_JENTRIES(journal); + return size <= MAX_SIT_JENTRIES(journal); } /* @@ -234,13 +270,13 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) -#define F2FS_IOC_SET_ENCRYPTION_POLICY \ - _IOR('f', 19, struct f2fs_encryption_policy) -#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ - _IOW('f', 20, __u8[16]) -#define F2FS_IOC_GET_ENCRYPTION_POLICY \ - _IOW('f', 21, struct f2fs_encryption_policy) +#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT /* * should be same as XFS_IOC_GOINGDOWN. @@ -256,33 +292,27 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + +struct f2fs_move_range { + u32 dst_fd; /* destination fd */ + u64 pos_in; /* start position in src_fd */ + u64 pos_out; /* start position in dst_fd */ + u64 len; /* size to move */ +}; + /* * For INODE and NODE manager */ /* for directory operations */ -struct f2fs_str { - unsigned char *name; - u32 len; -}; - -struct f2fs_filename { - const struct qstr *usr_fname; - struct f2fs_str disk_name; - f2fs_hash_t hash; -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_str crypto_buf; -#endif -}; - -#define FSTR_INIT(n, l) { .name = n, .len = l } -#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) -#define fname_name(p) ((p)->disk_name.name) -#define fname_len(p) ((p)->disk_name.len) - struct f2fs_dentry_ptr { struct inode *inode; const void *bitmap; @@ -350,6 +380,7 @@ struct extent_node { struct rb_node rb_node; /* rb node located in rb-tree */ struct list_head list; /* node in global extent list of sbi */ struct extent_info ei; /* extent info */ + struct extent_tree *et; /* extent tree pointer */ }; struct extent_tree { @@ -357,9 +388,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -378,6 +409,7 @@ struct f2fs_map_blocks { block_t m_lblk; unsigned int m_len; unsigned int m_flags; + pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ }; /* for flag in get_data_block */ @@ -385,6 +417,8 @@ struct f2fs_map_blocks { #define F2FS_GET_BLOCK_DIO 1 #define F2FS_GET_BLOCK_FIEMAP 2 #define 
F2FS_GET_BLOCK_BMAP 3 +#define F2FS_GET_BLOCK_PRE_DIO 4 +#define F2FS_GET_BLOCK_PRE_AIO 5 /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -406,15 +440,6 @@ struct f2fs_map_blocks { #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) -/* Encryption algorithms */ -#define F2FS_ENCRYPTION_MODE_INVALID 0 -#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4 - -#include "f2fs_crypto.h" - #define DEF_DIR_LEVEL 0 struct f2fs_inode_info { @@ -429,30 +454,27 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_pages; /* # of dirty pages */ + struct percpu_counter dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + loff_t last_disk_size; /* lastly written file size */ + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - /* Encryption params */ - struct f2fs_crypt_info *i_crypt_info; -#endif + struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ }; static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) + struct f2fs_extent *i_ext) { - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk = le32_to_cpu(i_ext.blk); - ext->len = le32_to_cpu(i_ext.len); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_extent(struct extent_info *ext, @@ -497,11 +519,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) +extern void f2fs_mark_inode_dirty_sync(struct inode *); +static inline void __try_update_largest_extent(struct inode *inode, + struct extent_tree *et, struct extent_node *en) { - if (en->ei.len > et->largest.len) + if (en->ei.len > et->largest.len) { et->largest = en->ei; + f2fs_mark_inode_dirty_sync(inode); + } } struct f2fs_nm_info { @@ -511,6 +536,7 @@ struct f2fs_nm_info { nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ + unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -544,6 +570,9 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ + char cur_level; /* level of hole node page */ + char max_level; /* level of current page located */ 
block_t data_blkaddr; /* block address of the node block */ }; @@ -594,6 +623,7 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t submit_flush; /* # of issued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -618,6 +648,7 @@ struct f2fs_sm_info { /* for small discard management */ struct list_head discard_list; /* 4KB discard list */ + struct list_head wait_list; /* linked with issued discard bio */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -645,11 +676,12 @@ struct f2fs_sm_info { * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, + F2FS_DIRTY_IMETA, NR_COUNT_TYPE, }; @@ -673,6 +705,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_REVOKE, IPU, OPU, }; @@ -681,7 +714,8 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ - block_t blk_addr; /* block address to be written */ + block_t new_blkaddr; /* new block address to be written */ + block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; @@ -695,6 +729,13 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -709,15 +750,31 @@ enum { SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - int s_flag; /* flags for sbi */ + int valid_super_block; /* valid super block no */ + unsigned long s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -728,32 +785,36 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* 
checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ - struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; - long cp_expires, cp_interval; /* next expected periodic cp */ + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -770,17 +831,24 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + atomic_t nr_wb_bios; /* # of writeback bios */ + + /* # of pages, see count_type */ + struct percpu_counter nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + + /* valid inode count */ + struct percpu_counter total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ @@ -809,7 +877,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -822,11 +890,102 @@ struct f2fs_sb_info { struct list_head s_list; struct mutex umount_mutex; unsigned int shrinker_run_no; + + /* For write statistics */ + u64 sectors_written_start; + u64 kbytes_written; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* For fault injection */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; +#endif }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +static inline bool 
time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + +/* For write statistics. Suppose sector size is 512 bytes, + * and the return value is in kbytes. s is of struct f2fs_sb_info. + */ +#define BD_PART_WRITTEN(s) \ +(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ + s->sectors_written_start) >> 1) + +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); + u32 *ctx = (u32 *)shash_desc_ctx(shash); + int err; + + shash->tfm = sbi->s_chksum_driver; + shash->flags = 0; + *ctx = F2FS_SUPER_MAGIC; + + err = crypto_shash_update(shash, address, length); + BUG_ON(err); + + return *ctx; +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -909,17 +1068,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { - return sbi->s_flag & (0x01 << type); + return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag |= (0x01 << type); + set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag &= ~(0x01 << type); + clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) @@ -927,26 +1086,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } -static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; } -static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = 
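
/*
 * time_to_inject() above fires once every inject_rate calls of an
 * enabled fault type. Its counting scheme without the sbi plumbing --
 * plain single-threaded C, where the kernel uses an atomic_t:
 */

#include <stdio.h>

struct fault_info {
	unsigned int inject_ops;	/* calls since the last injection */
	unsigned int inject_rate;	/* inject every Nth call; 0 = off */
	unsigned int inject_type;	/* bitmask of enabled fault types */
};

static int time_to_inject(struct fault_info *ffi, int type)
{
	if (!ffi->inject_rate || !(ffi->inject_type & (1u << type)))
		return 0;
	if (++ffi->inject_ops >= ffi->inject_rate) {
		ffi->inject_ops = 0;
		return 1;		/* simulate the failure this time */
	}
	return 0;
}

int main(void)
{
	struct fault_info ffi = { 0, 3, 1u << 0 };
	int i;

	for (i = 0; i < 7; i++)
		printf("call %d: %s\n", i,
		       time_to_inject(&ffi, 0) ? "inject" : "pass");
	return 0;
}
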
le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + spin_lock(&sbi->cp_lock); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q); +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -959,7 +1149,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); + down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) @@ -985,8 +1175,8 @@ static inline bool __remain_node_summaries(int reason) static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { - return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* @@ -1019,22 +1209,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) + struct inode *inode, blkcnt_t *count) { - block_t valid_block_count; + blkcnt_t diff; + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) + return false; +#endif + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. 
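
set_ckpt_flags()/clear_ckpt_flags() now take the sbi and serialize the little-endian read-modify-write under cp_lock, so concurrent updaters cannot drop each other's bits. The pattern in isolation, with hypothetical demo names:

#include <linux/spinlock.h>
#include <linux/types.h>
#include <asm/byteorder.h>

static DEFINE_SPINLOCK(demo_lock);
static __le32 demo_flags;               /* on-disk, little-endian */

static void demo_set_flag(u32 f)
{
        u32 v;

        spin_lock(&demo_lock);
        v = le32_to_cpu(demo_flags);    /* RMW must not interleave */
        v |= f;
        demo_flags = cpu_to_le32(v);
        spin_unlock(&demo_lock);
}
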
+ */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); - valid_block_count = - sbi->total_valid_block_count + (block_t)count; - if (unlikely(valid_block_count > sbi->user_block_count)) { - spin_unlock(&sbi->stat_lock); - return false; + sbi->total_valid_block_count += (block_t)(*count); + if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { + diff = sbi->total_valid_block_count - sbi->user_block_count; + *count -= diff; + sbi->total_valid_block_count = sbi->user_block_count; + if (!*count) { + spin_unlock(&sbi->stat_lock); + percpu_counter_sub(&sbi->alloc_valid_block_count, diff); + return false; + } } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; spin_unlock(&sbi->stat_lock); + + f2fs_i_blocks_write(inode, *count, true); return true; } @@ -1045,27 +1250,31 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < count); - inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); + f2fs_i_blocks_write(inode, count, false); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_inc(&sbi->nr_pages[count_type]); + percpu_counter_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + return; + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { - atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_dec(&sbi->nr_pages[count_type]); + percpu_counter_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1074,28 +1283,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
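
inc_valid_block_count() now charges the percpu allocation counter first, then clamps the request against user_block_count under stat_lock and hands back the excess, granting a partial allocation when only some blocks fit. The clamping arithmetic on its own, as plain userspace C with hypothetical names:

#include <stdbool.h>
#include <stdint.h>

/* clamp a block reservation against the user-visible maximum */
static bool reserve_blocks(uint64_t *total_valid, uint64_t user_max,
                           uint64_t *count)
{
        *total_valid += *count;
        if (*total_valid > user_max) {
                uint64_t diff = *total_valid - user_max;

                *count -= diff;                 /* grant what fits */
                *total_valid = user_max;
                if (*count == 0)
                        return false;           /* nothing granted */
        }
        return true;
}
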
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return atomic_read(&sbi->nr_pages[count_type]); + return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } -static inline int get_dirty_pages(struct inode *inode) +static inline s64 get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_pages); + return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -1103,6 +1312,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) return sbi->total_valid_block_count; } +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1182,13 +1396,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } if (inode) - inode->i_blocks++; + f2fs_i_blocks_write(inode, 1, true); - sbi->alloc_valid_block_count++; sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->alloc_valid_block_count); return true; } @@ -1201,7 +1415,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !inode->i_blocks); - inode->i_blocks--; + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; @@ -1215,28 +1429,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - return sbi->total_valid_inode_count; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct page *page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + return NULL; +#endif if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -1261,7 +1477,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) 
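
get_blocktype_secs() above first rounds the dirty-page count up to whole segments, then divides by segments per section. Worked through with hypothetical geometry (512 blocks per segment, one segment per section):

#include <stdio.h>

int main(void)
{
        unsigned int log_blocks_per_seg = 9;    /* 512 blocks/segment */
        unsigned int segs_per_sec = 1;
        unsigned int pages_per_sec = segs_per_sec << log_blocks_per_seg;
        long long pages = 1000;                 /* dirty pages */
        unsigned int segs;

        segs = (pages + pages_per_sec - 1) >> log_blocks_per_seg;
        printf("%u segments -> %u sections\n", segs, segs / segs_per_sec);
        return 0;                               /* 2 segments, 2 sections */
}
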
f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); unlock_page(page); } - page_cache_release(page); + put_page(page); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -1396,13 +1612,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ - FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1416,71 +1631,152 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; -static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) { - if (!test_bit(flag, &fi->flags)) - set_bit(flag, &fi->flags); + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + if (set) + return; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + f2fs_mark_inode_dirty_sync(inode); + } } -static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +static inline void set_inode_flag(struct inode *inode, int flag) { - return test_bit(flag, &fi->flags); + if (!test_bit(flag, &F2FS_I(inode)->flags)) + set_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); } -static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline int is_inode_flag_set(struct inode *inode, int flag) { - if (test_bit(flag, &fi->flags)) - clear_bit(flag, &fi->flags); + return test_bit(flag, &F2FS_I(inode)->flags); } -static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +static inline void clear_inode_flag(struct inode *inode, int flag) { - fi->i_acl_mode = mode; - set_inode_flag(fi, FI_ACL_MODE); + if (test_bit(flag, &F2FS_I(inode)->flags)) + clear_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); } -static inline void get_inline_info(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_acl_inode(struct inode *inode, umode_t mode) { + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_links_write(struct inode *inode, bool inc) +{ + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + blkcnt_t diff, bool add) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + inode->i_blocks = add ? 
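
__mark_inode_dirty_flag() above leans on switch fall-through: setting one of the inline flags returns early, while clearing one, or touching FI_DATA_EXIST/FI_INLINE_DOTS in either direction, dirties the inode so the change is written back. The control flow reduced to a runnable sketch (demo names hypothetical):

#include <stdbool.h>
#include <stdio.h>

enum { DEMO_FI_INLINE_DATA, DEMO_FI_DATA_EXIST };

static void demo_mark_dirty(void)
{
        puts("inode dirtied");
}

static void demo_dirty_on_flag(int flag, bool set)
{
        switch (flag) {
        case DEMO_FI_INLINE_DATA:
                if (set)
                        return;         /* setting: nothing to persist yet */
                /* fall through: clearing must reach disk */
        case DEMO_FI_DATA_EXIST:
                demo_mark_dirty();
        }
}

int main(void)
{
        demo_dirty_on_flag(DEMO_FI_INLINE_DATA, false); /* prints */
        demo_dirty_on_flag(DEMO_FI_INLINE_DATA, true);  /* silent */
        return 0;
}
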
inode->i_blocks + diff : + inode->i_blocks - diff; + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode) +{ + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) +{ + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) +{ + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + if (ri->i_inline & F2FS_INLINE_XATTR) - set_inode_flag(fi, FI_INLINE_XATTR); + set_bit(FI_INLINE_XATTR, &fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_inode_flag(fi, FI_INLINE_DATA); + set_bit(FI_INLINE_DATA, &fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_inode_flag(fi, FI_INLINE_DENTRY); + set_bit(FI_INLINE_DENTRY, &fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_inode_flag(fi, FI_DATA_EXIST); + set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_inode_flag(fi, FI_INLINE_DOTS); + set_bit(FI_INLINE_DOTS, &fi->flags); } -static inline void set_raw_inline(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; - if (is_inode_flag_set(fi, FI_INLINE_DATA)) + if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; - if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; - if (is_inode_flag_set(fi, FI_DATA_EXIST)) + if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; - if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); + return is_inode_flag_set(inode, FI_INLINE_XATTR); } -static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(&fi->vfs_inode)) + if (f2fs_has_inline_xattr(inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } @@ -1502,43 +1798,43 @@ static inline int inline_xattr_size(struct inode *inode) static inline int f2fs_has_inline_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); + return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline void f2fs_clear_inline_inode(struct inode *inode) { - 
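
f2fs_i_blocks_write()/f2fs_i_size_write() above raise FI_AUTO_RECOVER when the inode was clean, and f2fs_skip_inode_update() then lets fsync skip a full inode write if the on-disk size still matches, because roll-forward recovery can reconstruct the rest. The decision in isolation, with a hypothetical struct:

#include <stdbool.h>

struct demo_inode {
        bool auto_recover;              /* FI_AUTO_RECOVER */
        long long last_disk_size;
        long long i_size;
};

/* skippable only when recovery could rebuild the change anyway */
static bool skip_inode_update(const struct demo_inode *d)
{
        if (!d->auto_recover)
                return false;
        return d->last_disk_size == d->i_size;
}
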
clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + clear_inode_flag(inode, FI_INLINE_DATA); + clear_inode_flag(inode, FI_DATA_EXIST); } static inline int f2fs_exist_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); + return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_has_inline_dots(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); + return is_inode_flag_set(inode, FI_INLINE_DOTS); } static inline bool f2fs_is_atomic_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); + return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_volatile_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_VOLATILE_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); } static inline bool f2fs_is_drop_cache(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); + return is_inode_flag_set(inode, FI_DROP_CACHE); } static inline void *inline_data_addr(struct page *page) @@ -1549,7 +1845,7 @@ static inline void *inline_data_addr(struct page *page) static inline int f2fs_has_inline_dentry(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); + return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) @@ -1566,11 +1862,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1580,13 +1878,7 @@ static inline int f2fs_readonly(struct super_block *sb) static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { - return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); -} - -static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) -{ - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } static inline bool is_dot_dotdot(const struct qstr *str) @@ -1602,13 +1894,21 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - mode_t mode = inode->i_mode; - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || - is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT)) return false; - return S_ISREG(mode); + return S_ISREG(inode->i_mode); +} + +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; +#endif + return kmalloc(size, flags); } static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) @@ -1632,14 +1932,14 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) } #define get_inode_mode(i) \ - ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) /* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ - ((pgofs < ADDRS_PER_INODE(fi)) ? 
ADDRS_PER_INODE(fi) : \ - (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) +#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ + ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ + (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) /* * file.c @@ -1647,7 +1947,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *, bool); +int f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1660,9 +1960,10 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +struct inode *f2fs_iget_retry(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1675,29 +1976,34 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, +unsigned char get_de_type(struct f2fs_dir_entry *); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct f2fs_str *); + unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, struct page *); + const struct qstr *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, +void f2fs_drop_nlink(struct inode *, struct inode *); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, + struct page **); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); +int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode 
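
PGOFS_OF_NEXT_DNODE() jumps a page offset to the first slot covered by the next direct node. With the geometry f2fs.h defines elsewhere (923 direct pointers in the inode, 1018 per dnode block), an offset inside the first dnode lands at 923 + 1018 = 1941:

#include <stdio.h>

#define ADDRS_PER_INODE 923     /* direct pointers in the inode */
#define ADDRS_PER_BLOCK 1018    /* direct pointers per dnode block */

static unsigned long next_dnode_pgofs(unsigned long pgofs)
{
        if (pgofs < ADDRS_PER_INODE)
                return ADDRS_PER_INODE;
        return (pgofs - ADDRS_PER_INODE + ADDRS_PER_BLOCK) /
                ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
}

int main(void)
{
        printf("%lu\n", next_dnode_pgofs(1000));        /* 1941 */
        return 0;
}
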
*, + nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -1714,16 +2020,18 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ +int f2fs_inode_dirtied(struct inode *); +void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) void f2fs_msg(struct super_block *, const char *, const char *, ...); +int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, - struct f2fs_filename *fname); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *); /* * node.c @@ -1736,6 +2044,7 @@ int need_dentry_mark(struct f2fs_sb_info *, nid_t); bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); +pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); @@ -1746,8 +2055,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); -void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +void move_node_page(struct page *, int); +int fsync_node_pages(struct f2fs_sb_info *, struct inode *, + struct writeback_control *, bool); +int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); +void build_free_nids(struct f2fs_sb_info *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -1767,8 +2079,9 @@ void destroy_node_manager_caches(void); * segment.c */ void register_inmem_page(struct inode *, struct page *); -int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void drop_inmem_pages(struct inode *); +int commit_inmem_pages(struct inode *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); @@ -1778,7 +2091,6 @@ bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); -bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -1788,16 +2100,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(unsigned int, struct f2fs_io_info *); void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); void rewrite_data_page(struct f2fs_io_info *); +void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, + block_t, block_t, bool, bool); void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, 
block_t, unsigned char, bool); + block_t, block_t, unsigned char, bool, bool); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_summary_block *, - int, unsigned int, int); +int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); @@ -1807,6 +2120,7 @@ void destroy_segment_manager_caches(void); /* * checkpoint.c */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); @@ -1814,21 +2128,21 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); +int f2fs_sync_inode_meta(struct f2fs_sb_info *); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void add_orphan_inode(struct inode *); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1837,34 +2151,46 @@ void destroy_checkpoint_caches(void); * data.c */ void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, + struct page *, nid_t, enum page_type, int); +void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); +void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); +int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); +ssize_t 
f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); +#ifdef CONFIG_MIGRATION +int f2fs_migrate_page(struct address_space *, struct page *, struct page *, + enum migrate_mode); +#endif /* * gc.c */ int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); +block_t start_bidx_of_node(unsigned int, struct inode *); int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *, bool); bool space_for_roll_forward(struct f2fs_sb_info *); /* @@ -1878,18 +2204,20 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int ext_tree, zombie_tree, ext_node; + s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + s64 inmem_pages; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_pages; - int inline_xattr, inline_inode, inline_dir; - unsigned int valid_count, valid_node_count, valid_inode_count; + int bg_gc, wb_bios; + int inline_xattr, inline_inode, inline_dir, orphans; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1910,10 +2238,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) #define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) @@ -1988,14 +2317,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct 
f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -2016,7 +2346,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -2045,16 +2375,15 @@ int f2fs_convert_inline_inode(struct inode *); int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct f2fs_filename *, struct page **); -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); + struct fscrypt_name *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, - nid_t, umode_t); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct f2fs_str *); + struct fscrypt_str *); int f2fs_inline_data_fiemap(struct inode *, struct fiemap_extent_info *, __u64, __u64); @@ -2070,8 +2399,8 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +void f2fs_drop_extent_tree(struct inode *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2085,13 +2414,9 @@ void destroy_extent_cache(void); /* * crypto support */ -static inline int f2fs_encrypted_inode(struct inode *inode) +static inline bool f2fs_encrypted_inode(struct inode *inode) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return file_is_encrypt(inode); -#else - return 0; -#endif } static inline void f2fs_set_encrypted_inode(struct inode *inode) @@ -2103,26 +2428,38 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) static inline bool f2fs_bio_encrypted(struct bio *bio) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + return bio->bi_private != NULL; } static inline int f2fs_sb_has_crypto(struct super_block *sb) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); -#else - return 0; -#endif +} + +static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); +} + +static inline void set_opt_mode(struct f2fs_sb_info 
*sbi, unsigned int mt) +{ + clear_opt(sbi, ADAPTIVE); + clear_opt(sbi, LFS); + + switch (mt) { + case F2FS_MOUNT_ADAPTIVE: + set_opt(sbi, ADAPTIVE); + break; + case F2FS_MOUNT_LFS: + set_opt(sbi, LFS); + break; + } } static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else @@ -2130,74 +2467,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -/* crypto_policy.c */ -int f2fs_is_child_context_consistent_with_parent(struct inode *, - struct inode *); -int f2fs_inherit_context(struct inode *, struct inode *, struct page *); -int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); -int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); - -/* crypt.c */ -extern struct kmem_cache *f2fs_crypt_info_cachep; -bool f2fs_valid_contents_enc_mode(uint32_t); -uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); -struct page *f2fs_encrypt(struct inode *, struct page *); -int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); -int f2fs_decrypt_one(struct inode *, struct page *); -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); - -/* crypto_key.c */ -void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); - -/* crypto_fname.c */ -bool f2fs_valid_filenames_enc_mode(uint32_t); -u32 f2fs_fname_crypto_round_up(u32, u32); -int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *); -int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *, - const struct f2fs_str *, struct f2fs_str *); -int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *, - struct f2fs_str *); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION -void f2fs_restore_and_release_control_page(struct page **); -void f2fs_restore_control_page(struct page *); - -int __init f2fs_init_crypto(void); -int f2fs_crypto_initialize(void); -void f2fs_exit_crypto(void); - -int f2fs_has_encryption_key(struct inode *); - -int f2fs_get_encryption_info(struct inode *inode); - -void f2fs_fname_crypto_free_buffer(struct f2fs_str *); -int f2fs_fname_setup_filename(struct inode *, const struct qstr *, - int lookup, struct f2fs_filename *); -void f2fs_fname_free_filename(struct f2fs_filename *); -#else -static inline void f2fs_restore_and_release_control_page(struct page **p) { } -static inline void f2fs_restore_control_page(struct page *p) { } - -static inline int __init f2fs_init_crypto(void) { return 0; } -static inline void f2fs_exit_crypto(void) { } - -static inline int f2fs_has_encryption_key(struct inode *i) { return 0; } -static inline int f2fs_get_encryption_info(struct inode *i) { return 0; } -static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { } - -static inline int f2fs_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct f2fs_filename *fname) -{ - memset(fname, 0, sizeof(struct f2fs_filename)); - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *)iname->name; - fname->disk_name.len = iname->len; - return 0; -} - -static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { } +#ifndef CONFIG_F2FS_FS_ENCRYPTION +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page 
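
set_opt_mode() clears both mode bits before setting one, keeping ADAPTIVE and LFS mutually exclusive. A hypothetical call site, assuming the f2fs.h context above plus <linux/string.h> and <linux/errno.h> (f2fs's real parse_options() matches tokens rather than raw strings):

static int demo_parse_mode(struct f2fs_sb_info *sbi, const char *arg)
{
        if (!strcmp(arg, "adaptive"))
                set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
        else if (!strcmp(arg, "lfs"))
                set_opt_mode(sbi, F2FS_MOUNT_LFS);
        else
                return -EINVAL;         /* unknown mode= value */
        return 0;
}
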
fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_process_policy fscrypt_notsupp_process_policy +#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk #endif #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4b449d263333..c6e33258fabf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" @@ -40,8 +42,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +57,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -74,19 +76,20 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { unsigned offset; - offset = i_size_read(inode) & ~PAGE_CACHE_MASK; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + offset = i_size_read(inode) & ~PAGE_MASK; + zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) @@ -96,6 +99,7 @@ mapped: clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -132,7 +136,7 @@ static inline bool need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; @@ -170,21 +174,16 @@ static void try_to_fix_pino(struct inode *inode) fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { - fi->i_pino = pino; + f2fs_i_pino_write(inode, pino); file_got_pino(inode); - 
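
The block above maps every fscrypt_* entry point to its *_notsupp stub when encryption is compiled out, so call sites build unchanged either way. The pattern reduced to one hypothetical pair:

#include <errno.h>

struct inode;

#ifdef CONFIG_DEMO_CRYPTO
int demo_get_key(struct inode *inode);          /* real implementation */
#else
static inline int demo_get_key_notsupp(struct inode *inode)
{
        return -EOPNOTSUPP;                     /* feature compiled out */
}
#define demo_get_key demo_get_key_notsupp
#endif
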
up_write(&fi->i_sem); - - mark_inode_dirty_sync(inode); - f2fs_write_inode(inode, NULL); - } else { - up_write(&fi->i_sem); } + up_write(&fi->i_sem); } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; @@ -201,10 +200,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) - set_inode_flag(fi, FI_NEED_IPU); + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(inode, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - clear_inode_flag(fi, FI_NEED_IPU); + clear_inode_flag(inode, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -212,7 +211,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync) { + if (!datasync && !f2fs_skip_inode_update(inode)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -220,29 +219,26 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * if there is no written data, don't waste time to write recovery info. */ - if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; - if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - down_read(&fi->i_sem); + down_read(&F2FS_I(inode)->i_sem); need_cp = need_do_checkpoint(inode); - up_read(&fi->i_sem); + up_read(&F2FS_I(inode)->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ @@ -253,19 +249,23 @@ go_write: * will be used only for fsynced inodes after checkpoint. 
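
With the change above, fdatasync() (or a small dirty set) forces in-place updates via FI_NEED_IPU, so the data flush does not allocate new blocks that would in turn require extra node writes. From userspace the distinction is just the syscall choice:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("demo.dat", O_WRONLY | O_CREAT, 0644);

        if (fd < 0)
                return 1;
        if (write(fd, "x", 1) != 1)
                return 1;
        /* data-only sync: the filesystem may rewrite in place and
         * skip flushing metadata that recovery can reconstruct */
        fdatasync(fd);
        close(fd);
        return 0;
}
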
*/ try_to_fix_pino(inode); - clear_inode_flag(fi, FI_APPEND_WRITE); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); goto out; } sync_nodes: - sync_node_pages(sbi, ino, &wbc); - - /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + ret = fsync_node_pages(sbi, inode, &wbc, atomic); + if (ret) goto out; + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out; + } + if (need_inode_block_update(sbi, ino)) { - mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -275,18 +275,24 @@ sync_nodes: goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); - clear_inode_flag(fi, FI_APPEND_WRITE); + remove_ino_entry(sbi, ino, APPEND_INO); + clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); - clear_inode_flag(fi, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + return f2fs_do_sync_file(file, start, end, datasync, false); +} + static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { @@ -300,7 +306,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, pagevec_init(&pvec, 0); nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; + pgofs = nr_pages ? 
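
The sync_nodes path above loops because flushing node pages can redirty the inode block; it rewrites the inode and flushes again until nothing is pending. The retry-until-stable shape as a self-contained sketch (demo state hypothetical):

/* each flush may redirty the inode block once more; loop until
 * a flush completes with nothing left pending */
struct demo_inode {
        int pending;            /* pending inode-block updates */
};

static int flush_nodes(struct demo_inode *di)
{
        if (di->pending > 0)
                di->pending--;
        return 0;
}

static int sync_until_stable(struct demo_inode *di)
{
        int err;

        do {
                err = flush_nodes(di);
                if (err)
                        return err;
        } while (di->pending > 0);
        return 0;
}
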
pvec.pages[0]->index : ULONG_MAX; pagevec_release(&pvec); return pgofs; } @@ -332,7 +338,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -345,32 +351,31 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto found; } - pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { - pgofs = PGOFS_OF_NEXT_DNODE(pgofs, - F2FS_I(inode)); + pgofs = get_next_page_offset(&dn, pgofs); continue; } else { goto found; } } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, - data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); @@ -387,10 +392,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -418,19 +423,20 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode); if (err) return 0; + if (!f2fs_encrypted_inode(inode)) + return -ENOKEY; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -440,12 +446,22 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); + struct dentry *dir; if (!ret && f2fs_encrypted_inode(inode)) { - ret = f2fs_get_encryption_info(inode); + ret = fscrypt_get_encryption_info(inode); if (ret) - ret = -EACCES; + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; } + dir = dget_parent(file_dentry(filp)); + if (f2fs_encrypted_inode(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + dput(dir); + return -EPERM; + } + dput(dir); return ret; } @@ -468,8 +484,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) set_data_blkaddr(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(F2FS_I(dn->inode), - FI_FIRST_BLOCK_WRITTEN); + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; } @@ -480,14 
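
f2fs_seek_block() walks dnode blocks to answer SEEK_DATA/SEEK_HOLE, and the ULONG_MAX fix above keeps the no-dirty-page sentinel from truncating on 64-bit. The userspace view of the same operation:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("sparse.dat", O_RDONLY);
        off_t data, hole;

        if (fd < 0)
                return 1;
        data = lseek(fd, 0, SEEK_DATA);         /* first data extent */
        hole = lseek(fd, 0, SEEK_HOLE);         /* first hole (or EOF) */
        printf("data at %lld, hole at %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
}
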
+495,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) * we will invalidate all blkaddr in the whole range. */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), - F2FS_I(dn->inode)) + ofs; + dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); - sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -501,8 +515,8 @@ void truncate_data_blocks(struct dnode_of_data *dn) static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; @@ -510,7 +524,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = f2fs_grab_cache_page(mapping, index, false); + page = find_lock_page(mapping, index); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); @@ -521,9 +535,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (IS_ERR(page)) return 0; truncate_out: - f2fs_wait_on_page_writeback(page, DATA); - zero_user(page, offset, PAGE_CACHE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user(page, offset, PAGE_SIZE - offset); + if (!cache_only || !f2fs_encrypted_inode(inode) || + !S_ISREG(inode->i_mode)) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -543,6 +558,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + if (free_from >= sbi->max_file_blocks) + goto free_partial; + if (lock) f2fs_lock_op(sbi); @@ -561,14 +579,14 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; goto out; } - count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = ADDRS_PER_PAGE(dn.node_page, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); @@ -584,7 +602,7 @@ free_next: out: if (lock) f2fs_unlock_op(sbi); - +free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); @@ -593,7 +611,7 @@ out: return err; } -int f2fs_truncate(struct inode *inode, bool lock) +int f2fs_truncate(struct inode *inode) { int err; @@ -604,18 +622,18 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } - err = truncate_blocks(inode, i_size_read(inode), lock); + err = truncate_blocks(inode, i_size_read(inode), true); if (err) return err; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -631,7 +649,6 @@ int f2fs_getattr(struct vfsmount *mnt, #ifdef CONFIG_F2FS_FS_POSIX_ACL static void __setattr_copy(struct inode *inode, const struct iattr *attr) { 
- struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) @@ -652,7 +669,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - set_acl_inode(fi, mode); + set_acl_inode(inode, mode); } } #else @@ -662,7 +679,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = inode_change_ok(inode, attr); @@ -671,21 +687,28 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { if (f2fs_encrypted_inode(inode) && - f2fs_get_encryption_info(inode)) + fscrypt_get_encryption_info(inode)) return -EACCES; if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -694,13 +717,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, get_inode_mode(inode)); - if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - clear_inode_flag(fi, FI_ACL_MODE); + if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); } } - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return err; } @@ -727,7 +750,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -736,7 +759,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (IS_ERR(page)) return PTR_ERR(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -761,7 +784,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -778,19 +801,17 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE 
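
punch_hole() above zeroes the partial edge pages with fill_zero() and drops only whole pages in between; it services the standard fallocate request. A minimal caller:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <unistd.h>

int main(void)
{
        int fd = open("demo.dat", O_WRONLY);

        if (fd < 0)
                return 1;
        /* drop 64KiB starting at 4KiB; any partially covered edge
         * page is zeroed rather than freed */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4096, 65536))
                return 1;
        close(fd);
        return 0;
}
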
- 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -800,7 +821,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; } @@ -815,10 +836,10 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); - blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; - blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -831,83 +852,199 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int __exchange_data_block(struct inode *inode, pgoff_t src, - pgoff_t dst, bool full) +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - block_t new_addr; - bool do_replace = false; - int ret; + int ret, done, i; +next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - if (!is_checkpointed_data(sbi, new_addr)) { - dn.data_blkaddr = NULL_ADDR; + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (!is_checkpointed_data(sbi, *blkaddr)) { + + if (test_opt(sbi, LFS)) { + f2fs_put_dnode(&dn); + return -ENOTSUPP; + } + /* do not invalidate this block address */ - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - do_replace = true; + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + *do_replace = 1; + } + } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} + +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; + + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); } f2fs_put_dnode(&dn); } + return 0; +} - if (new_addr == NULL_ADDR) - return full ? 
truncate_hole(inode, dst, dst + 1) : 0; +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; - if (do_replace) { - struct page *ipage = get_node_page(sbi, inode->i_ino); - struct node_info ni; - - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto err_out; + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; } - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, dst); - if (ret) - goto err_out; + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; - truncate_data_blocks_range(&dn, 1); + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, - ni.version, true); - f2fs_put_dnode(&dn); - } else { - struct page *psrc, *pdst; + get_node_info(sbi, dn.nid, &ni); + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = datablock_addr(dn.node_page, + dn.ofs_in_node); + truncate_data_blocks_range(&dn, 1); - psrc = get_lock_data_page(inode, src, true); - if (IS_ERR(psrc)) - return PTR_ERR(psrc); - pdst = get_new_data_page(inode, NULL, dst, false); - if (IS_ERR(pdst)) { + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false); + f2fs_i_blocks_write(dst_inode, + 1, true); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(src_inode, src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - return PTR_ERR(pdst); - } - f2fs_copy_page(psrc, pdst); - set_page_dirty(pdst); - f2fs_put_page(pdst, 1); - f2fs_put_page(psrc, 1); - return truncate_hole(inode, src, src + 1); + ret = truncate_hole(src_inode, src + i, src + i + 1); + if (ret) + return ret; + i++; + } + } + return 0; +} + +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); + + src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); } return 0; -err_out: - if 
(!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - f2fs_put_dnode(&dn); - } +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + kvfree(src_blkaddr); + kvfree(do_replace); return ret; } @@ -915,16 +1052,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; + int ret; - for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, end, start, true); - f2fs_unlock_op(sbi); - if (ret) - break; - } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); return ret; } @@ -941,16 +1077,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } - - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -972,7 +1104,50 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); + + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + /* + * reserve_new_blocks will not guarantee entire block + * allocation. 
+ */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -991,13 +1166,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1005,11 +1176,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, truncate_pagecache_range(inode, offset, offset + len - 1); - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -1023,48 +1194,40 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; new_size = max_t(loff_t, new_size, - (loff_t)pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_SHIFT); } - for (index = pg_start; index < pg_end; index++) { + for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; - struct page *ipage; + unsigned int end_offset; + pgoff_t end; f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - f2fs_unlock_op(sbi); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, index); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; } - if (dn.data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); - dn.data_blkaddr = NEW_ADDR; - set_data_blkaddr(&dn); - - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - } + ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + if (ret) + goto out; + index = end; new_size = max_t(loff_t, new_size, - (loff_t)(index + 1) << PAGE_CACHE_SHIFT); + (loff_t)index << PAGE_SHIFT); } if (off_end) { @@ -1077,11 +1240,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1089,7 +1249,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t pg_start, pg_end, delta, nrpages, idx; + pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1104,13 +1264,11 @@ static int 
f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1123,17 +1281,23 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; - nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; - for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, idx, idx + delta, false); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); f2fs_unlock_op(sbi); - if (ret) - break; } /* write out all moved pages, if possible */ @@ -1141,7 +1305,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); return ret; } @@ -1149,60 +1313,48 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t index, pg_start, pg_end; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + pgoff_t pg_end; loff_t new_size = i_size_read(inode); - loff_t off_start, off_end; - int ret = 0; - - f2fs_balance_fs(sbi); + loff_t off_end; + int ret; ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) & (PAGE_SIZE - 1); + + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; + + ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (ret) { + pgoff_t last_off; + + if (!map.m_len) return ret; + + last_off = map.m_lblk + map.m_len - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? 
offset + len:
+			(loff_t)(last_off + 1) << PAGE_SHIFT;
+	} else {
+		new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
 	}
 
-	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
-	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
-
-	off_start = offset & (PAGE_CACHE_SIZE - 1);
-	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
-
-	f2fs_lock_op(sbi);
-
-	for (index = pg_start; index <= pg_end; index++) {
-		struct dnode_of_data dn;
-
-		if (index == pg_end && !off_end)
-			goto noalloc;
-
-		set_new_dnode(&dn, inode, NULL, NULL, 0);
-		ret = f2fs_reserve_block(&dn, index);
-		if (ret)
-			break;
-noalloc:
-		if (pg_start == pg_end)
-			new_size = offset + len;
-		else if (index == pg_start && off_start)
-			new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
-		else if (index == pg_end)
-			new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
-								off_end;
-		else
-			new_size += PAGE_CACHE_SIZE;
-	}
-
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-		i_size_read(inode) < new_size) {
-		i_size_write(inode, new_size);
-		mark_inode_dirty(inode);
-		update_inode_page(inode);
-	}
-	f2fs_unlock_op(sbi);
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
+		f2fs_i_size_write(inode, new_size);
 
 	return ret;
 }
@@ -1226,7 +1378,7 @@ static long f2fs_fallocate(struct file *file, int mode,
 			FALLOC_FL_INSERT_RANGE))
 		return -EOPNOTSUPP;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		if (offset >= inode->i_size)
@@ -1245,11 +1397,12 @@ static long f2fs_fallocate(struct file *file, int mode,
 
 	if (!ret) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		mark_inode_dirty(inode);
+		f2fs_mark_inode_dirty_sync(inode);
+		f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	}
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	trace_f2fs_fallocate(inode, mode, offset, len, ret);
 	return ret;
@@ -1257,13 +1410,22 @@ out:
 
 static int f2fs_release_file(struct inode *inode, struct file *filp)
 {
+	/*
+	 * f2fs_release_file is called on every close call, so we should
+	 * not drop any in-memory pages on a close issued by another
+	 * process.
+	 */
+	if (!(filp->f_mode & FMODE_WRITE) ||
+			atomic_read(&inode->i_writecount) != 1)
+		return 0;
+
 	/* some remained atomic pages should discarded */
 	if (f2fs_is_atomic_file(inode))
-		commit_inmem_pages(inode, true);
+		drop_inmem_pages(inode);
 	if (f2fs_is_volatile_file(inode)) {
-		set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+		clear_inode_flag(inode, FI_VOLATILE_FILE);
+		set_inode_flag(inode, FI_DROP_CACHE);
 		filemap_fdatawrite(inode->i_mapping);
-		clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+		clear_inode_flag(inode, FI_DROP_CACHE);
 	}
 	return 0;
 }
@@ -1293,33 +1455,29 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+	unsigned int flags;
 	unsigned int oldflags;
 	int ret;
 
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (get_user(flags, (int __user *)arg))
+		return -EFAULT;
+
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
 
-	if (!inode_owner_or_capable(inode)) {
-		ret = -EACCES;
-		goto out;
-	}
-
-	if (get_user(flags, (int __user *)arg)) {
-		ret = -EFAULT;
-		goto out;
-	}
-
 	flags = f2fs_mask_flags(inode->i_mode, flags);
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	oldflags = fi->i_flags;
 
 	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
 		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			ret = -EPERM;
 			goto out;
 		}
@@ -1328,11 +1486,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 	flags = flags & FS_FL_USER_MODIFIABLE;
 	flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
 	fi->i_flags = flags;
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
-	f2fs_set_inode_flags(inode);
 	inode->i_ctime = CURRENT_TIME;
-	mark_inode_dirty(inode);
+	f2fs_set_inode_flags(inode);
 out:
 	mnt_drop_write_file(filp);
 	return ret;
@@ -1353,17 +1510,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	f2fs_balance_fs(F2FS_I_SB(inode));
-
-	if (f2fs_is_atomic_file(inode))
-		return 0;
-
-	ret = f2fs_convert_inline_inode(inode);
+	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
 
-	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-	return 0;
+	inode_lock(inode);
+
+	if (f2fs_is_atomic_file(inode))
+		goto out;
+
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		goto out;
+
+	set_inode_flag(inode, FI_ATOMIC_FILE);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
+	if (!get_dirty_pages(inode))
+		goto out;
+
+	f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
		"Unexpected flush for atomic writes: ino=%lu, npages=%lld",
+					inode->i_ino, get_dirty_pages(inode));
+	ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+	if (ret)
+		clear_inode_flag(inode, FI_ATOMIC_FILE);
+out:
+	inode_unlock(inode);
+	mnt_drop_write_file(filp);
+	return ret;
 }
 
 static int f2fs_ioc_commit_atomic_write(struct file *filp)
@@ -1374,22 +1549,27 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	if (f2fs_is_volatile_file(inode))
-		return 0;
-
 	ret = mnt_want_write_file(filp);
 	if (ret)
 		return ret;
 
+	inode_lock(inode);
+
+	if (f2fs_is_volatile_file(inode))
+		goto err_out;
+
 	if (f2fs_is_atomic_file(inode)) {
-		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-		ret = commit_inmem_pages(inode, false);
-		if (ret)
+		clear_inode_flag(inode, FI_ATOMIC_FILE);
+		ret = commit_inmem_pages(inode);
+		if (ret) {
+			set_inode_flag(inode, FI_ATOMIC_FILE);
 			goto err_out;
+
} } - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1402,31 +1582,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - - ret = f2fs_convert_inline_inode(inode); + ret = mnt_want_write_file(filp); if (ret) return ret; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - return 0; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (!f2fs_is_volatile_file(inode)) - return 0; + goto out; - if (!f2fs_is_first_block_written(inode)) - return truncate_partial_data_page(inode, 0, true); + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } - return punch_hole(inode, 0, F2FS_BLKSIZE); + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1441,13 +1644,19 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); + inode_lock(inode); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - commit_inmem_pages(inode, true); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages(inode); + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(inode, FI_VOLATILE_FILE); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + } + + inode_unlock(inode); mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1457,6 +1666,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1464,30 +1674,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); if (sb && !IS_ERR(sb)) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ f2fs_sync_fs(sb, 1); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: sync_meta_pages(sbi, META, LONG_MAX); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; + f2fs_update_time(sbi, REQ_TIME); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -1508,15 +1726,21 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) sizeof(range))) 
return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + range.minlen = max((unsigned int)range.minlen, q->limits.discard_granularity); ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1532,45 +1756,31 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int err; - if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, - sizeof(policy))) + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, + sizeof(policy))) return -EFAULT; - mutex_lock(&inode->i_mutex); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - err = f2fs_process_policy(&policy, inode); - - mutex_unlock(&inode->i_mutex); - - return err; -#else - return -EOPNOTSUPP; -#endif + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int err; - err = f2fs_get_policy(inode, &policy); + err = fscrypt_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, - sizeof(policy))) + if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; -#else - return -EOPNOTSUPP; -#endif } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1593,13 +1803,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1612,6 +1822,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 sync; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1622,21 +1833,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) - return -EBUSY; + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } } else { mutex_lock(&sbi->gc_mutex); } - return f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct cp_control cpc; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1644,13 +1864,343 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - cpc.reason = __get_cp_reason(sbi); + ret = mnt_want_write_file(filp); + if (ret) + return ret; - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); - 
mutex_unlock(&sbi->gc_mutex); + ret = f2fs_sync_fs(sbi->sb, 1); - return 0; + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = false; + int err; + + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_SHIFT; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; + break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, 0, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(inode, FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(inode, FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(inode, FI_DO_DEFRAG); +out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + 
+ err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; +} + +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; + + if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + return -EOPNOTSUPP; + + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + + inode_lock(src); + if (src != dst) { + if (!inode_trylock(dst)) { + ret = -EBUSY; + goto out; + } + } + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); +out_unlock: + if (src != dst) + inode_unlock(dst); +out: + inode_unlock(src); + return ret; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + + dst = fdget(range.dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto 
err_out; + + err = f2fs_move_file_range(filp, range.pos_in, dst.file, + range.pos_out, range.len); + + mnt_drop_write_file(filp); + + if (copy_to_user((struct f2fs_move_range __user *)arg, + &range, sizeof(range))) + err = -EFAULT; +err_out: + fdput(dst); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1686,6 +2236,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_gc(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); default: return -ENOTTY; } @@ -1693,14 +2247,36 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct blk_plug plug; + ssize_t ret; if (f2fs_encrypted_inode(inode) && - !f2fs_has_encryption_key(inode) && - f2fs_get_encryption_info(inode)) + !fscrypt_has_encryption_key(inode) && + fscrypt_get_encryption_info(inode)) return -EACCES; - return generic_file_write_iter(iocb, from); + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) { + ret = f2fs_preallocate_blocks(iocb, from); + if (!ret) { + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); + } + } + inode_unlock(inode); + + if (ret > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + return ret; } #ifdef CONFIG_COMPAT @@ -1713,6 +2289,24 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; + case F2FS_IOC_MOVE_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fedbf67a0842..0a0a1ad1fe1f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "node.h" @@ -48,6 +47,11 @@ static int gc_thread_func(void *data) continue; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + /* * [GC triggering condition] * 0. GC is not conducted currently. 
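
The hunk above wires the background GC thread into f2fs's fault-injection framework: when CONFIG_F2FS_FAULT_INJECTION is set and the FAULT_CHECKPOINT type is armed, the GC loop deliberately stops checkpointing so that error-handling paths can be exercised. time_to_inject() itself is not part of this diff; the fragment below is a simplified, self-contained sketch of that rate-based trigger pattern. The structure layout and names here are assumptions modeled on fs/f2fs/f2fs.h of this era, not a verbatim copy of the kernel code:

	/* Sketch of a rate-based fault trigger: fire once every
	 * inject_rate calls for a fault type that is enabled in the
	 * inject_type bitmask. */
	struct fault_info {
		unsigned int inject_ops;	/* calls since the last injected fault */
		unsigned int inject_rate;	/* inject once every N calls */
		unsigned int inject_type;	/* bitmask of enabled fault types */
	};

	static int time_to_inject_sketch(struct fault_info *ffi, unsigned int type)
	{
		if (!ffi->inject_rate || !(ffi->inject_type & (1U << type)))
			return 0;		/* injection disabled for this type */
		if (++ffi->inject_ops < ffi->inject_rate)
			return 0;		/* not due yet */
		ffi->inject_ops = 0;		/* due: reset the counter and fire */
		return 1;
	}

The real helper additionally logs which fault fired; the point is only that injection is deterministic per call count, so a test run with a given rate reproduces the same failure sites.
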
@@ -97,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -173,9 +177,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -246,6 +250,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, return get_cb_cost(sbi, segno); } +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len) +{ + unsigned int end = offset + len, sum = 0; + + while (offset < end) { + if (test_bit(offset++, addr)) + ++sum; + } + return sum; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -259,9 +275,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int secno, max_cost; + unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); - int nsearched = 0; + unsigned int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -269,11 +285,12 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = max_cost = get_max_cost(sbi, &p); + p.min_cost = get_max_cost(sbi, &p); if (p.max_search == 0) goto out; + last_victim = sbi->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -296,27 +313,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } p.offset = segno + p.ofs_unit; - if (p.ofs_unit > 1) + if (p.ofs_unit > 1) { p.offset -= segno % p.ofs_unit; + nsearched += count_bits(p.dirty_segmap, + p.offset - p.ofs_unit, + p.ofs_unit); + } else { + nsearched++; + } + secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) - continue; + goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) - continue; + goto next; cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; - } else if (unlikely(cost == max_cost)) { - continue; } - - if (nsearched++ >= p.max_search) { - sbi->last_victim[p.gc_mode] = segno; +next: + if (nsearched >= p.max_search) { + if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) + sbi->last_victim[p.gc_mode] = last_victim + 1; + else + sbi->last_victim[p.gc_mode] = segno + 1; break; } } @@ -400,13 +425,13 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static int gc_node_segment(struct f2fs_sb_info *sbi, +static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { - bool initial = true; struct f2fs_summary *entry; block_t start_addr; int off; + int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -419,16 +444,24 @@ next_step: struct node_info ni; /* stop BG_GC if there is not enough free sections. 
*/ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; - if (initial) { + if (phase == 0) { + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { ra_node_page(sbi, nid); continue; } + + /* phase == 2 */ node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; @@ -445,36 +478,12 @@ next_step: continue; } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } - f2fs_put_page(node_page, 1); + move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } - if (initial) { - initial = false; + if (++phase < 3) goto next_step; - } - - if (gc_type == FG_GC) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, 0, &wbc); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } /* @@ -484,7 +493,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -501,7 +510,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode); } static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -547,6 +556,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) struct f2fs_summary sum; struct node_info ni; struct page *page; + block_t newaddr; int err; /* do not read out */ @@ -568,21 +578,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. 
*/ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ fio.page = page; - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), - fio.blk_addr, - FGP_LOCK|FGP_CREAT, - GFP_NOFS); - if (!fio.encrypted_page) - goto put_out; + allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + &sum, CURSEG_COLD_DATA); + + fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto recover_block; + } err = f2fs_submit_page_bio(&fio); if (err) @@ -591,33 +604,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* write page */ lock_page(fio.encrypted_page); - if (unlikely(!PageUptodate(fio.encrypted_page))) + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { + err = -EIO; goto put_page_out; - if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + } + if (unlikely(!PageUptodate(fio.encrypted_page))) { + err = -EIO; goto put_page_out; + } set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE); - allocate_data_block(fio.sbi, NULL, fio.blk_addr, - &fio.blk_addr, &sum, CURSEG_COLD_DATA); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + fio.rw = WRITE_SYNC; + fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); - dn.data_blkaddr = fio.blk_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + f2fs_update_data_blkaddr(&dn, newaddr); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); +recover_block: + if (err) + __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + true, true); put_out: f2fs_put_dnode(&dn); out: @@ -645,12 +664,23 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + bool is_dirty = PageDirty(page); + int err; + +retry: set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); + set_cold_data(page); - do_write_data_page(&fio); + + err = do_write_data_page(&fio); + if (err == -ENOMEM && is_dirty) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + clear_cold_data(page); } out: @@ -664,7 +694,7 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. 
*/ -static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -684,16 +714,23 @@ next_step: struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { - ra_node_page(sbi, le32_to_cpu(entry->nid)); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + ra_node_page(sbi, nid); continue; } @@ -701,14 +738,14 @@ next_step: if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; - if (phase == 1) { + if (phase == 2) { ra_node_page(sbi, dni.ino); continue; } ofs_in_node = le16_to_cpu(entry->ofs_in_node); - if (phase == 2) { + if (phase == 3) { inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode)) continue; @@ -720,7 +757,7 @@ next_step: continue; } - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { @@ -733,30 +770,41 @@ next_step: continue; } - /* phase 3 */ + /* phase 4 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + + if (S_ISREG(inode->i_mode)) { + if (!down_write_trylock(&fi->dio_rwsem[READ])) + continue; + if (!down_write_trylock( + &fi->dio_rwsem[WRITE])) { + up_write(&fi->dio_rwsem[READ]); + continue; + } + locked = true; + } + + start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) move_encrypted_block(inode, start_bidx); else move_data_page(inode, start_bidx, gc_type); + + if (locked) { + up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->dio_rwsem[READ]); + } + stat_inc_data_blk_count(sbi, 1, gc_type); } } - if (++phase < 4) + if (++phase < 5) goto next_step; - - if (gc_type == FG_GC) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -772,51 +820,84 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static int do_garbage_collect(struct f2fs_sb_info *sbi, + unsigned int start_segno, struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; - int nfree = 0; + unsigned int segno = start_segno; + unsigned int end_segno = start_segno + sbi->segs_per_sec; + int sec_freed = 0; + unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? 
+ SUM_TYPE_DATA : SUM_TYPE_NODE; - /* read segment summary of victim */ - sum_page = get_sum_page(sbi, segno); + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + sbi->segs_per_sec, META_SSA, true); + + /* reference all summary page */ + while (segno < end_segno) { + sum_page = get_sum_page(sbi, segno++); + unlock_page(sum_page); + } blk_start_plug(&plug); - sum = page_address(sum_page); + for (segno = start_segno; segno < end_segno; segno++) { - /* - * this is to avoid deadlock: - * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() - * - lock_page(sum_page) - */ - unlock_page(sum_page); + if (get_valid_blocks(sbi, segno, 1) == 0 || + unlikely(f2fs_cp_error(sbi))) + goto next; - switch (GET_SUM_TYPE((&sum->footer))) { - case SUM_TYPE_NODE: - nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); - break; - case SUM_TYPE_DATA: - nfree = gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); - break; + /* find segment summary of victim */ + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_bug_on(sbi, !PageUptodate(sum_page)); + f2fs_put_page(sum_page, 0); + + sum = page_address(sum_page); + f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - mutex_lock(sentry_lock) + * - mutex_lock(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + + if (type == SUM_TYPE_NODE) + gc_node_segment(sbi, sum->entries, segno, gc_type); + else + gc_data_segment(sbi, sum->entries, gc_list, segno, + gc_type); + + stat_inc_seg_count(sbi, type, gc_type); +next: + f2fs_put_page(sum_page, 0); } + + if (gc_type == FG_GC) + f2fs_submit_merged_bio(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + blk_finish_plug(&plug); - stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); + if (gc_type == FG_GC && + get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + sec_freed = 1; + stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); - return nfree; + return sec_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { - unsigned int segno, i; + unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; @@ -832,46 +913,48 @@ gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { gc_type = FG_GC; - if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) - write_checkpoint(sbi, &cpc); + /* + * If there is no victim and no prefree segment but still not + * enough free sections, we should flush dent/node blocks and do + * garbage collections. 
+ */ + if (__get_victim(sbi, &segno, gc_type) || + prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + segno = NULL_SEGNO; + } else if (has_not_enough_free_secs(sbi, 0, 0)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; - /* readahead multi ssa blocks those have contiguous address */ - if (sbi->segs_per_sec > 1) - ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA, true); - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * for FG_GC case, halt gcing left segments once failed one - * of segments in selected section to avoid long latency. - */ - if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && - gc_type == FG_GC) - break; - } - - if (i == sbi->segs_per_sec && gc_type == FG_GC) + if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && + gc_type == FG_GC) sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); } stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..a993967dcdb9 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index b238d2fec3e5..71b7206c431e 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,8 +70,7 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, - struct f2fs_filename *fname) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -80,10 +79,6 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, const unsigned char *name = name_info->name; size_t len = name_info->len; - /* encrypted bigname case */ - if (fname && !fname->disk_name.name) - return cpu_to_le32(fname->hash); - if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index f35f3eb3541f..b150d03beec1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -17,9 +17,6 @@ bool f2fs_may_inline_data(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -55,7 +52,7 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); /* Copy the whole inline data block */ src_addr = inline_data_addr(ipage); @@ -63,7 +60,8 @@ void read_inline_data(struct page *page, struct page *ipage) memcpy(dst_addr, src_addr, MAX_INLINE_DATA); flush_dcache_page(page); kunmap_atomic(dst_addr); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } bool truncate_inline_inode(struct page *ipage, u64 from) @@ -75,9 +73,9 @@ bool truncate_inline_inode(struct page *ipage, u64 from) addr = inline_data_addr(ipage); - 
f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); - + set_page_dirty(ipage); return true; } @@ -112,11 +110,12 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } if (page->index) - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); else read_inline_data(page, ipage); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); f2fs_put_page(ipage, 1); trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE); @@ -126,7 +125,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { - void *src_addr, *dst_addr; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, @@ -136,8 +134,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) }; int dirty, err; - f2fs_bug_on(F2FS_I_SB(dn->inode), page->index); - if (!f2fs_exist_data(dn->inode)) goto clear_out; @@ -145,21 +141,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - f2fs_wait_on_page_writeback(page, DATA); + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); - if (PageUptodate(page)) - goto no_update; - - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); - - /* Copy the whole inline data block */ - src_addr = inline_data_addr(dn->inode_page); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - flush_dcache_page(page); - kunmap_atomic(dst_addr); - SetPageUptodate(page); -no_update: + read_inline_data(page, dn->inode_page); set_page_dirty(page); /* clear dirty state */ @@ -167,23 +151,21 @@ no_update: /* write data page to try to make data consistent */ set_page_writeback(page); - fio.blk_addr = dn->data_blkaddr; + fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); - set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) inode_dec_dirty_pages(dn->inode); /* this converted inline_data should be recovered. 
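A note on the conversion path above: f2fs_convert_inline_page() now reuses read_inline_data(), whose whole job is to copy the inline payload into the head of a page and zero the tail out to PAGE_SIZE, which is what the zero_user_segment() call expresses. A self-contained sketch of that fill follows; the MAX_INLINE_DATA value here is illustrative, the real one is derived from the on-disk inode layout.

#include <string.h>

#define PAGE_SIZE       4096
#define MAX_INLINE_DATA 3488    /* illustrative; layout-derived in f2fs */

/* Copy the inline payload to the page head, zero the rest of the page. */
static void fill_page_from_inline(unsigned char *page,
                                  const unsigned char *inline_addr)
{
    memcpy(page, inline_addr, MAX_INLINE_DATA);
    memset(page + MAX_INLINE_DATA, 0, PAGE_SIZE - MAX_INLINE_DATA);
}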
*/ - set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); + set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ truncate_inline_inode(dn->inode_page, 0); + clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); - sync_inode_page(dn); f2fs_put_dnode(dn); return 0; } @@ -195,7 +177,10 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; - page = grab_cache_page(inode->i_mapping, 0); + if (!f2fs_has_inline_data(inode)) + return 0; + + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -217,6 +202,9 @@ out: f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } @@ -238,16 +226,17 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); - f2fs_wait_on_page_writeback(dn.inode_page, NODE); + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); dst_addr = inline_data_addr(dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); kunmap_atomic(src_addr); + set_page_dirty(dn.inode_page); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); - sync_inode_page(&dn); + clear_inline_node(dn.inode_page); f2fs_put_dnode(&dn); return 0; } @@ -276,16 +265,16 @@ process_inline: ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); src_addr = inline_data_addr(npage); dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); - update_inode(inode, ipage); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return true; } @@ -296,7 +285,6 @@ process_inline: if (!truncate_inline_inode(ipage, 0)) return false; f2fs_clear_inline_inode(inode); - update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -307,7 +295,7 @@ process_inline: } struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, - struct f2fs_filename *fname, struct page **res_page) + struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_inline_dentry *inline_dentry; @@ -318,10 +306,12 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } - namehash = f2fs_dentry_hash(&name, fname); + namehash = f2fs_dentry_hash(&name); inline_dentry = inline_data_addr(ipage); @@ -333,30 +323,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, else f2fs_put_page(ipage, 0); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. 
- */ - f2fs_bug_on(sbi, d.max < 0); - return de; -} - -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, - struct page **p) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; - struct f2fs_dir_entry *de; - struct f2fs_inline_dentry *dentry_blk; - - ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) - return NULL; - - dentry_blk = inline_data_addr(ipage); - de = &dentry_blk->dentry[1]; - *p = ipage; - unlock_page(ipage); return de; } @@ -374,10 +340,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) { - i_size_write(inode, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + if (i_size_read(inode) < MAX_INLINE_DATA) + f2fs_i_size_write(inode, MAX_INLINE_DATA); return 0; } @@ -385,7 +349,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. */ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -393,7 +357,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_dentry_block *dentry_blk; int err; - page = grab_cache_page(dir->i_mapping, 0); + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -404,8 +368,8 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, if (err) goto out; - f2fs_wait_on_page_writeback(page, DATA); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); dentry_blk = kmap_atomic(page); @@ -426,37 +390,132 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, NR_INLINE_DENTRY * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); set_page_dirty(page); /* clear inline dir and flag after data writeback */ truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); - clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + clear_inode_flag(dir, FI_INLINE_DENTRY); - if (i_size_read(dir) < PAGE_CACHE_SIZE) { - i_size_write(dir, PAGE_CACHE_SIZE); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - - sync_inode_page(&dn); + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); out: f2fs_put_page(page, 1); return err; } -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode, nid_t ino, umode_t mode) +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, + ino, 
fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(namelen); + int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; int err = 0; @@ -477,25 +536,27 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); + page = init_inode_metadata(inode, dir, new_name, + orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(name, NULL); + name_hash = f2fs_dentry_hash(new_name); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); - f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); /* we don't need to mark_inode_dirty now */ if (inode) { - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -503,11 +564,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode(dir, ipage); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } out: f2fs_put_page(ipage, 1); return err; @@ -522,22 +578,22 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); inline_dentry = inline_data_addr(page); bit_pos = dentry - 
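The f2fs_move_rehashed_dirents() function added above converts an inline directory with a backup-and-replay pattern: snapshot the inline dentry area, wipe it, replay every entry into regular dentry blocks, and restore the snapshot if the replay fails. A hedged user-space sketch of that shape, where the replay callback stands in for f2fs_add_inline_entries():

#include <stdlib.h>
#include <string.h>

static int move_entries(unsigned char *inline_area, size_t len,
                        int (*replay)(const unsigned char *, size_t))
{
    unsigned char *backup = malloc(len);
    int err;

    if (!backup)
        return -1;                        /* ~ -ENOMEM */

    memcpy(backup, inline_area, len);     /* keep a recovery copy */
    memset(inline_area, 0, len);          /* ~ truncate_inline_inode() */

    err = replay(backup, len);            /* ~ f2fs_add_inline_entries() */
    if (err)
        memcpy(inline_area, backup, len); /* recover the old contents */

    free(backup);
    return err;
}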
inline_dentry->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, + __clear_bit_le(bit_pos + i, &inline_dentry->dentry_bitmap); set_page_dirty(page); + f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, page); - - f2fs_put_page(page, 1); + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_inline_dir(struct inode *dir) @@ -565,7 +621,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) } int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, - struct f2fs_str *fstr) + struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); struct f2fs_inline_dentry *inline_dentry = NULL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 97e20decacb4..d7369895a78a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "f2fs.h" @@ -18,6 +19,13 @@ #include +void f2fs_mark_inode_dirty_sync(struct inode *inode) +{ + if (f2fs_inode_dirtied(inode)) + return; + mark_inode_dirty_sync(inode); +} + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -35,6 +43,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + f2fs_mark_inode_dirty_sync(inode); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -83,10 +92,10 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) while (start < end) { if (*start++) { - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); set_page_dirty(ipage); return; } @@ -138,9 +147,10 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); - get_inline_info(fi, ri); + get_inline_info(inode, ri); /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) @@ -150,7 +160,10 @@ static int do_read_inode(struct inode *inode) __get_inode_rdev(inode, ri); if (__written_first_block(ri)) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + if (!need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; f2fs_put_page(node_page, 1); @@ -202,6 +215,7 @@ make_now: inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -221,11 +235,27 @@ bad_inode: return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + } + return inode; +} + +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE); + f2fs_inode_synced(inode); + 
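The new f2fs_mark_inode_dirty_sync() defined above makes dirty marking idempotent: judging from the call site, f2fs_inode_dirtied() reports that the inode is already being tracked, so the VFS-level mark_inode_dirty_sync() call runs only on the first transition. A toy model of that guard, with invented types:

#include <stdbool.h>

struct toy_inode { bool on_dirty_list; };

static int vfs_marks;                          /* counts the expensive calls */

static bool inode_dirtied(struct toy_inode *i) /* ~ f2fs_inode_dirtied() */
{
    bool was_dirty = i->on_dirty_list;
    i->on_dirty_list = true;
    return was_dirty;
}

static void mark_dirty_sync(struct toy_inode *i)
{
    if (inode_dirtied(i))
        return;                                /* already dirty: nothing to do */
    vfs_marks++;                               /* ~ mark_inode_dirty_sync() */
}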
+ f2fs_wait_on_page_writeback(node_page, NODE, true); ri = F2FS_INODE(node_page); @@ -242,7 +272,7 @@ void update_inode(struct inode *inode, struct page *node_page) &ri->i_ext); else memset(&ri->i_ext, 0, sizeof(ri->i_ext)); - set_raw_inline(F2FS_I(inode), ri); + set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -259,15 +289,19 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + /* deleted inode */ + if (inode->i_nlink == 0) + clear_inline_node(node_page); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -276,12 +310,14 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); } - return; + f2fs_inode_synced(inode); + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -292,16 +328,15 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; /* * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - update_inode_page(inode); - - f2fs_balance_fs(sbi); + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -311,13 +346,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - nid_t xnid = fi->i_xattr_nid; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -327,19 +361,24 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - sb_start_intwrite(inode->i_sb); - set_inode_flag(fi, FI_NO_ALLOC); - i_size_write(inode, 0); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); + set_inode_flag(inode, FI_NO_ALLOC); + i_size_write(inode, 0); +retry: if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (!err) { f2fs_lock_op(sbi); @@ -347,6 +386,14 @@ void f2fs_evict_inode(struct inode *inode) f2fs_unlock_op(sbi); } + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (err) + update_inode_page(inode); sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); @@ -356,36 +403,18 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) - add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); - if (is_inode_flag_set(fi, FI_FREE_NID)) { - if (err && err != -ENOENT) - alloc_nid_done(sbi, inode->i_ino); - else - alloc_nid_failed(sbi, inode->i_ino); - clear_inode_flag(fi, FI_FREE_NID); - } - - if (err && err != -ENOENT) { - if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { - /* - * get here because we failed to release resource - * of inode previously, reminder our user to run fsck - * for fixing. - */ - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "inode (ino:%lu) resource leak, run fsck " - "to fix this issue!", inode->i_ino); - } + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (is_inode_flag_set(inode, FI_FREE_NID)) { + alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); } + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (fi->i_crypt_info) - f2fs_free_encryption_info(inode, fi->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); } @@ -393,37 +422,32 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int err = 0; + struct node_info ni; - clear_nlink(inode); - make_bad_inode(inode); + /* don't make bad inode, since it becomes a regular file. 
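The retry added to f2fs_evict_inode() above treats -ENOMEM from truncation as transient: instead of leaking the inode's blocks, it clears the error and loops until reclaim makes progress. A compilable sketch of just that shape; do_truncate() is an invented stand-in that fails twice before succeeding.

#include <errno.h>

static int enomem_left = 2;              /* simulate transient pressure */

static int do_truncate(void)             /* ~ f2fs_truncate() */
{
    return enomem_left-- > 0 ? -ENOMEM : 0;
}

static int evict_blocks(void)
{
    int err;
retry:
    err = do_truncate();
    if (err == -ENOMEM) {
        err = 0;                         /* give more chances, as above */
        goto retry;
    }
    return err;
}

int main(void) { return evict_blocks(); }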
*/ unlock_new_inode(inode); - i_size_write(inode, 0); - if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, false); - - if (!err) - err = remove_inode_page(inode); - /* - * if we skip truncate_node in remove_inode_page bacause we failed - * before, it's better to find another way to release resource of - * this inode (e.g. valid block count, node block or nid). Here we - * choose to add this inode to orphan list, so that we can call iput - * for releasing in orphan recovery flow. - * * Note: we should add inode to orphan list before f2fs_unlock_op() * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - if (err && err != -ENOENT) { - err = acquire_orphan_inode(sbi); - if (!err) - add_orphan_inode(sbi, inode->i_ino); + get_node_info(sbi, inode->i_ino, &ni); + + if (ni.blk_addr != NULL_ADDR) { + int err = acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { + add_orphan_inode(inode); + } + alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); } - set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2c32110f9fc0..0f071a70522d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(inode, FI_NEW_INODE); + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + set_inode_flag(inode, FI_INLINE_DENTRY); f2fs_init_extent_tree(inode, NULL); @@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_dir(inode); trace_f2fs_new_inode(inode, 0); - mark_inode_dirty(inode); return inode; fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); if (nid_free) - set_inode_flag(F2FS_I(inode), FI_FREE_NID); + set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); } @@ -88,18 +91,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); + int i; /* * filename format of multimedia file should be defined as: - * "filename + '.' + extension". + * "filename + '.' + extension + (optional: '.' + temp extension)". 
*/ if (slen < sublen + 2) return 0; - if (s[slen - sublen - 1] != '.') - return 0; + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } - return !strncasecmp(s + slen - sublen, sub, sublen); + return 0; } /* @@ -128,8 +136,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +148,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -169,15 +177,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, int err; if (f2fs_encrypted_inode(dir) && - !f2fs_is_child_context_consistent_with_parent(dir, inode)) + !fscrypt_has_permitted_context(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -190,7 +198,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); return 0; out: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); iput(inode); f2fs_unlock_op(sbi); return err; @@ -199,10 +207,14 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); - if (!ino) + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); + } + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } static int __recover_dot_dentries(struct inode *dir, nid_t pino) @@ -214,12 +226,24 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) @@ -230,14 +254,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); } out: - if (!err) { - clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); - mark_inode_dirty(dir); - } + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); f2fs_unlock_op(sbi); return err; @@ -251,13 +275,32 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct page *page; nid_t ino; int err = 0; + unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + + if (f2fs_encrypted_inode(dir)) { + int res = fscrypt_get_encryption_info(dir); + + /* + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is + * created while the directory was encrypted and 
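The reworked is_multimedia_file() above no longer anchors the extension at the end of the name: it scans for any '.' whose following characters begin with the extension, so an in-flight temporary like "clip.mp4.tmp" matches "mp4" just as "clip.mp4" does. A self-contained user-space rendering of the same scan (strncasecmp comes from <strings.h>):

#include <string.h>
#include <strings.h>

static int is_multimedia_name(const char *s, const char *sub)
{
    size_t slen = strlen(s);
    size_t sublen = strlen(sub);
    size_t i;

    if (slen < sublen + 2)
        return 0;

    /* accept "name.ext" and "name.ext.tmp"; note the match only requires
     * a component to begin with the extension, exactly as in the hunk */
    for (i = 1; i + sublen < slen; i++) {
        if (s[i] != '.')
            continue;
        if (!strncasecmp(s + i + 1, sub, sublen))
            return 1;
    }
    return 0;
}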
we + * don't have access to the key. + */ + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + return (struct dentry *)page; return d_splice_alias(inode, dentry); + } ino = le32_to_cpu(de->ino); f2fs_dentry_kunmap(dir, page); @@ -267,15 +310,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return ERR_CAST(inode); + if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { + err = __recover_dot_dentries(dir, root_ino); + if (err) + goto err_out; + } + if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) goto err_out; } + if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + bool nokey = f2fs_encrypted_inode(inode) && + !fscrypt_has_encryption_key(inode); + err = nokey ? -ENOKEY : -EPERM; + goto err_out; + } return d_splice_alias(inode, dentry); err_out: - iget_failed(inode); + iput(inode); return ERR_PTR(err); } @@ -288,11 +345,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); goto fail; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -305,9 +366,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ - mark_inode_dirty(inode); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -332,16 +390,24 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t len = strlen(symname); - size_t p_len; - char *p_str; - struct f2fs_str disk_link = FSTR_INIT(NULL, 0); - struct f2fs_encrypted_symlink_data *sd = NULL; + struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); + struct fscrypt_symlink_data *sd = NULL; int err; - if (len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (f2fs_encrypted_inode(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + return err; - f2fs_balance_fs(sbi); + if (!fscrypt_has_encryption_key(dir)) + return -EPERM; + + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); + } + + if (disk_link.len > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) @@ -351,8 +417,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -360,42 +429,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - if (f2fs_encrypted_inode(dir)) { + if (f2fs_encrypted_inode(inode)) { struct qstr istr = QSTR_INIT(symname, len); + struct fscrypt_str ostr; - err = 
f2fs_get_encryption_info(inode); - if (err) - goto err_out; - - err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); - if (err) - goto err_out; - - err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); - if (err < 0) - goto err_out; - - p_len = encrypted_symlink_data_len(disk_link.len) + 1; - - if (p_len > dir->i_sb->s_blocksize) { - err = -ENAMETOOLONG; - goto err_out; - } - - sd = kzalloc(p_len, GFP_NOFS); + sd = kzalloc(disk_link.len, GFP_NOFS); if (!sd) { err = -ENOMEM; goto err_out; } - memcpy(sd->encrypted_path, disk_link.name, disk_link.len); - sd->len = cpu_to_le16(disk_link.len); - p_str = (char *)sd; - } else { - p_len = len + 1; - p_str = (char *)symname; + + err = fscrypt_get_encryption_info(inode); + if (err) + goto err_out; + + if (!fscrypt_has_encryption_key(inode)) { + err = -EPERM; + goto err_out; + } + + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (err < 0) + goto err_out; + + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *)sd; } - err = page_symlink(inode, p_str, p_len); + err = page_symlink(inode, disk_link.name, disk_link.len); err_out: d_instantiate(dentry, inode); @@ -411,7 +474,8 @@ err_out: * performance regression. */ if (!err) { - filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); + filemap_write_and_wait_range(inode->i_mapping, 0, + disk_link.len - 1); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -420,7 +484,6 @@ err_out: } kfree(sd); - f2fs_fname_crypto_free_buffer(&disk_link); return err; out: handle_failed_inode(inode); @@ -433,8 +496,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -444,7 +505,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_balance_fs(sbi, true); + + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -461,7 +524,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; out_fail: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); handle_failed_inode(inode); return err; } @@ -481,8 +544,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -490,6 +551,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -516,9 +579,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -532,6 +592,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -545,17 +607,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan 
list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - + add_orphan_inode(inode); alloc_nid_done(sbi, inode->i_ino); if (whiteout) { - inode_dec_link_count(inode); + f2fs_i_links_write(inode, false); *whiteout = inode; } else { d_tmpfile(dentry, inode); } + /* link_count was changed by d_tmpfile as well. */ + f2fs_unlock_op(sbi); unlock_new_inode(inode); return 0; @@ -569,7 +631,7 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { if (f2fs_encrypted_inode(dir)) { - int err = f2fs_get_encryption_info(dir); + int err = fscrypt_get_encryption_info(dir); if (err) return err; } @@ -595,26 +657,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && - !f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode)) { + !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_old; + } } if (flags & RENAME_WHITEOUT) { @@ -632,8 +697,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_whiteout; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -641,8 +711,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - if (update_dent_inode(old_inode, new_inode, - &new_dentry->d_name)) { + err = update_dent_inode(old_inode, new_inode, + &new_dentry->d_name); + if (err) { release_orphan_inode(sbi); goto put_out_dir; } @@ -652,20 +723,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) - drop_nlink(new_inode); - drop_nlink(new_inode); + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); up_write(&F2FS_I(new_inode)->i_sem); - mark_inode_dirty(new_inode); - if (!new_inode->i_nlink) - add_orphan_inode(sbi, new_inode->i_ino); + add_orphan_inode(new_inode); else release_orphan_inode(sbi); - - update_inode_page(old_inode); - update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -674,9 +742,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_whiteout; } - if (old_dir_entry) { - inc_nlink(new_dir); - update_inode_page(new_dir); + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); + + /* + * old entry and new entry can locate in the same inline + * dentry in inode, when attaching new entry in inline dentry, + * it could force inline dentry conversion, after that, + * old_entry and old_page will point to wrong address, in + * order to avoid this, 
let's do the check and update here. + */ + if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { + f2fs_put_page(old_page, 0); + old_page = NULL; + + old_entry = f2fs_find_entry(old_dir, + &old_dentry->d_name, &old_page); + if (!old_entry) { + err = -ENOENT; + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + f2fs_unlock_op(sbi); + goto out_whiteout; + } } } @@ -687,13 +775,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(old_inode); + f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); if (whiteout) { whiteout->i_state |= I_LINKABLE; - set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; @@ -705,14 +793,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - update_inode_page(old_inode); } else { f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } - drop_nlink(old_dir); - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_i_links_write(old_dir, false); } f2fs_unlock_op(sbi); @@ -756,39 +841,45 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int err = -ENOENT; if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode) || - !f2fs_is_child_context_consistent_with_parent(old_dir, - new_inode))) + (old_dir != new_dir) && + (!fscrypt_has_permitted_context(new_dir, old_inode) || + !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_old; + } /* prepare for updating ".." 
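The re-lookup above guards a classic stale-pointer hazard: old_entry points into the inline dentry block, and attaching the new entry may convert that block, moving the storage out from under the pointer. The same hazard in miniature, using realloc() as the "conversion"; all names here are invented:

#include <stdlib.h>
#include <string.h>

struct table { char *buf; size_t used, cap; };

static char *find_entry(struct table *t, char c)
{
    return memchr(t->buf, c, t->used);
}

static int grow_table(struct table *t, char c)  /* may move the storage */
{
    if (t->used == t->cap) {
        char *nbuf = realloc(t->buf, t->cap * 2);
        if (!nbuf)
            return -1;
        t->buf = nbuf;
        t->cap *= 2;
    }
    t->buf[t->used++] = c;
    return 0;
}

static int rename_like(struct table *t, char oldc, char newc)
{
    char *old_entry = find_entry(t, oldc);

    if (!old_entry || grow_table(t, newc))
        return -1;

    /* old_entry may now dangle; re-derive it, just as the hunk re-calls
     * f2fs_find_entry() after an inline dentry conversion */
    old_entry = find_entry(t, oldc);
    if (!old_entry)
        return -1;
    *old_entry = 0;                     /* ~ f2fs_delete_entry() */
    return 0;
}

int main(void)
{
    struct table t = { malloc(2), 0, 2 };

    if (!t.buf)
        return 1;
    t.buf[t.used++] = 'a';
    t.buf[t.used++] = 'b';
    return rename_like(&t, 'a', 'c') ? 1 : 0;
}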
directory entry info later */ if (old_dir != new_dir) { if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_new; + } } if (S_ISDIR(new_inode->i_mode)) { - err = -EIO; new_dir_entry = f2fs_parent_dir(new_inode, &new_dir_page); - if (!new_dir_entry) + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); goto out_old_dir; + } } } @@ -807,6 +898,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -836,19 +929,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - update_inode_page(old_inode); - old_dir->i_ctime = CURRENT_TIME; if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); - if (old_nlink < 0) - drop_nlink(old_dir); - else - inc_nlink(old_dir); + f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_mark_inode_dirty_sync(old_dir); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -857,19 +944,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - update_inode_page(new_inode); - new_dir->i_ctime = CURRENT_TIME; if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); - if (new_nlink < 0) - drop_nlink(new_dir); - else - inc_nlink(new_dir); + f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - mark_inode_dirty(new_dir); - update_inode_page(new_dir); + f2fs_mark_inode_dirty_sync(new_dir); f2fs_unlock_op(sbi); @@ -922,89 +1003,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -#ifdef CONFIG_F2FS_FS_ENCRYPTION static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; - struct f2fs_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_str cstr = FSTR_INIT(NULL, 0); + struct fscrypt_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_symlink_data *sd; struct inode *inode = d_inode(dentry); - struct f2fs_encrypted_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; - res = f2fs_get_encryption_info(inode); + if (!dentry) + return ERR_PTR(-ECHILD); + + res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); - caddr[size] = 0; + caddr = page_address(cpage); /* Symlink is encrypted */ - sd = (struct f2fs_encrypted_symlink_data *)caddr; + sd = (struct fscrypt_symlink_data *)caddr; + cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); - cstr.name = kmalloc(cstr.len, GFP_NOFS); - if (!cstr.name) { - res = -ENOMEM; - goto errout; - } - memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.len == 0)) { res = -ENOENT; goto errout; } - if ((cstr.len + sizeof(struct 
f2fs_encrypted_symlink_data) - 1) > - max_size) { + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ res = -EIO; goto errout; } - res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); + res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; - res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; - kfree(cstr.name); + /* this is broken symlink case */ + if (unlikely(pstr.name[0] == 0)) { + res = -ENOENT; + goto errout; + } paddr = pstr.name; /* Null-terminate the name */ paddr[res] = '\0'; - kunmap(cpage); - page_cache_release(cpage); + put_page(cpage); return *cookie = paddr; errout: - kfree(cstr.name); - f2fs_fname_crypto_free_buffer(&pstr); - kunmap(cpage); - page_cache_release(cpage); + fscrypt_fname_free_buffer(&pstr); + put_page(cpage); return ERR_PTR(res); } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_encrypted_follow_link, - .put_link = kfree_put_link, + .follow_link = f2fs_encrypted_follow_link, + .put_link = kfree_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, -}; #endif +}; const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7bcbc6e9c40d..b1e615ed2bef 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,12 +46,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) */ if (type == FREE_NIDS) { mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -62,16 +64,17 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) for (i = 0; i <= UPDATE_INO; i++) mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; + sizeof(struct ino_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * - sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -120,7 +123,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -256,18 +259,21 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) return new; } -static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, +static void 
cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); + } else { + f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || + nat_get_blkaddr(e) != ne->block_addr || + nat_get_version(e) != ne->version); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -355,7 +361,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = START_NID(nid); struct f2fs_nat_block *nat_blk; struct page *page = NULL; @@ -372,21 +378,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - } - up_read(&nm_i->nat_tree_lock); - if (e) + up_read(&nm_i->nat_tree_lock); return; + } memset(&ne, 0, sizeof(struct f2fs_nat_entry)); /* Check current segment summary */ - mutex_lock(&curseg->curseg_mutex); - i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); + down_read(&curseg->journal_rwsem); + i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { - ne = nat_in_journal(sum, i); + ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); if (i >= 0) goto cache; @@ -397,18 +402,75 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: + up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - cache_nat_entry(NM_I(sbi), nid, &ne); + down_write(&nm_i->nat_tree_lock); + cache_nat_entry(sbi, nid, &ne); + up_write(&nm_i->nat_tree_lock); +} + +/* + * readahead MAX_RA_NODE number of node pages. + */ +static void ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +{ + const long direct_index = ADDRS_PER_INODE(dn->inode); + const long direct_blks = ADDRS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + unsigned int skipped_unit = ADDRS_PER_BLOCK; + int cur_level = dn->cur_level; + int max_level = dn->max_level; + pgoff_t base = 0; + + if (!dn->max_level) + return pgofs + 1; + + while (max_level-- > cur_level) + skipped_unit *= NIDS_PER_BLOCK; + + switch (dn->max_level) { + case 3: + base += 2 * indirect_blks; + case 2: + base += 2 * direct_blks; + case 1: + base += direct_index; + break; + default: + f2fs_bug_on(F2FS_I_SB(dn->inode), 1); + } + + return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base; } /* * The maximum depth is four. * Offset[0] will have raw inode offset. 
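The f2fs_encrypted_follow_link() rework a little earlier adds two sanity checks on the on-disk record before decrypting: a zero length means a broken symlink, and a length overrunning the block means corruption. A sketch of those checks follows; the struct mirrors the shape of fscrypt_symlink_data (a 16-bit length plus a one-byte path placeholder, which is why the kernel bound subtracts 1), but the types are simplified, e.g. no explicit little-endian handling.

#include <stdint.h>
#include <stddef.h>

struct sym_rec {
    uint16_t len;       /* __le16 on disk; host-endian in this sketch */
    char path[1];       /* placeholder byte, as in fscrypt_symlink_data */
};

static int validate_symlink(const struct sym_rec *rec, size_t block_size)
{
    size_t len = rec->len;              /* ~ le16_to_cpu(sd->len) */

    if (len == 0)
        return -1;                      /* broken symlink, ~ -ENOENT */
    if (len + sizeof(struct sym_rec) - 1 > block_size)
        return -2;                      /* corrupted record, ~ -EIO */
    return 0;
}

int main(void)
{
    struct sym_rec rec = { 5, { 'x' } };
    return validate_symlink(&rec, 4096) ? 1 : 0;
}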
*/ -static int get_node_path(struct f2fs_inode_info *fi, long block, +static int get_node_path(struct inode *inode, long block, int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE(fi); + const long direct_index = ADDRS_PER_INODE(inode); const long direct_blks = ADDRS_PER_BLOCK; const long dptrs_per_blk = NIDS_PER_BLOCK; const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; @@ -493,10 +555,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int offset[4]; unsigned int noffset[4]; nid_t nids[4]; - int level, i; + int level, i = 0; int err = 0; - level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); + level = get_node_path(dn->inode, index, offset, noffset); nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -583,6 +645,11 @@ release_pages: release_out: dn->inode_page = NULL; dn->node_page = NULL; + if (err == -ENOENT) { + dn->cur_level = i; + dn->max_level = level; + dn->ofs_in_node = offset[level]; + } return err; } @@ -606,8 +673,7 @@ static void truncate_node(struct dnode_of_data *dn) if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); - } else { - sync_inode_page(dn); + f2fs_inode_synced(dn->inode); } invalidate: clear_node_page_dirty(dn->node_page); @@ -666,6 +732,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } + ra_node_pages(page, ofs, NIDS_PER_BLOCK); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { @@ -676,7 +744,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +758,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -741,6 +811,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } + ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); @@ -750,7 +822,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -787,8 +860,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); - level = get_node_path(F2FS_I(inode), from, offset, noffset); -restart: + level = get_node_path(inode, from, offset, noffset); + page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); @@ -852,11 +925,8 @@ skip_partial: if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto restart; - } - f2fs_wait_on_page_writeback(page, NODE); + BUG_ON(page->mapping != NODE_MAPPING(sbi)); + 
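get_node_path() above (and get_next_page_offset() before it) lean on the same geometry: a file block index falls either in the inode's own pointers, in one of two direct nodes, in one of two indirect nodes, or in the double-indirect subtree. A sketch of the level computation; the three constants are placeholders for ADDRS_PER_INODE, ADDRS_PER_BLOCK and NIDS_PER_BLOCK, whose real values depend on the on-disk layout.

#include <stdio.h>

#define DIRECT_INDEX  923L   /* ~ ADDRS_PER_INODE: pointers in the inode */
#define DIRECT_BLKS  1018L   /* ~ ADDRS_PER_BLOCK: pointers per dnode    */
#define NIDS_PER_BLK 1018L   /* ~ NIDS_PER_BLOCK: nids per indirect node */

/* Return the tree depth (0..3) a file block index lands on, mirroring
 * the cascade of range tests in get_node_path(). */
static int node_level(long block)
{
    long direct = 2 * DIRECT_BLKS;                  /* two direct nodes   */
    long indirect = 2 * DIRECT_BLKS * NIDS_PER_BLK; /* two indirect nodes */

    if (block < DIRECT_INDEX)
        return 0;                                   /* inside the inode   */
    block -= DIRECT_INDEX;
    if (block < direct)
        return 1;
    block -= direct;
    if (block < indirect)
        return 2;
    return 3;                                       /* double indirect    */
}

int main(void)
{
    printf("%d %d %d\n", node_level(100), node_level(2000),
           node_level(3000000));
    return 0;
}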
f2fs_wait_on_page_writeback(page, NODE, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -885,7 +955,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) if (IS_ERR(npage)) return PTR_ERR(npage); - F2FS_I(inode)->i_xattr_nid = 0; + f2fs_i_xnid_write(inode, 0); /* need to do checkpoint during fsync */ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -951,10 +1021,10 @@ struct page *new_node_page(struct dnode_of_data *dn, struct page *page; int err; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -971,23 +1041,19 @@ struct page *new_node_page(struct dnode_of_data *dn, new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR, false); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); - SetPageUptodate(page); - set_page_dirty(page); + if (!PageUptodate(page)) + SetPageUptodate(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) - F2FS_I(dn->inode)->i_xattr_nid = dn->nid; + f2fs_i_xnid_write(dn->inode, dn->nid); - dn->node_page = page; - if (ipage) - update_inode(dn->inode, ipage); - else - sync_inode_page(dn); if (ofs == 0) inc_valid_inode_count(sbi); - return page; fail: @@ -1013,6 +1079,9 @@ static int read_node_page(struct page *page, int rw) .encrypted_page = NULL, }; + if (PageUptodate(page)) + return LOCKED_PAGE; + get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1020,10 +1089,7 @@ static int read_node_page(struct page *page, int rw) return -ENOENT; } - if (PageUptodate(page)) - return LOCKED_PAGE; - - fio.blk_addr = ni.blk_addr; + fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; return f2fs_submit_page_bio(&fio); } @@ -1035,14 +1101,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; - apage = find_get_page(NODE_MAPPING(sbi), nid); - if (apage && PageUptodate(apage)) { - f2fs_put_page(apage, 0); + if (!nid) return; - } - f2fs_put_page(apage, 0); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); - apage = grab_cache_page(NODE_MAPPING(sbi), nid); + rcu_read_lock(); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + rcu_read_unlock(); + if (apage) + return; + + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!apage) return; @@ -1050,53 +1119,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 
1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { struct page *page; int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } - - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; - } - return page; -} - -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; - struct page *page; - int err, i, end; - nid_t nid; - - /* First, try getting the desired direct node. */ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1108,61 +1141,116 @@ repeat: goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } + + if (unlikely(!PageUptodate(page))) + goto out_err; page_hit: - if (unlikely(!PageUptodate(page))) { + if(unlikely(nid != nid_of_node(page))) { + f2fs_bug_on(sbi, 1); + ClearPageUptodate(page); +out_err: f2fs_put_page(page, 1); return ERR_PTR(-EIO); } return page; } -void sync_inode_page(struct dnode_of_data *dn) +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); - } else if (dn->inode_page) { - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); - if (!dn->inode_page_locked) - unlock_page(dn->inode_page); - } else { - update_inode_page(dn->inode); - } + return __get_node_page(sbi, nid, NULL, 0); } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + +static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct page *page; + int ret; + + /* should flush inline_data before evict_inode */ + inode = ilookup(sbi->sb, ino); + if (!inode) + return; + + page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + if (!page) + goto iput_out; + + if (!PageUptodate(page)) + goto page_out; + + if (!PageDirty(page)) + goto page_out; + + if (!clear_page_dirty_for_io(page)) + goto page_out; + + ret = f2fs_write_inline_data(inode, page); + 
inode_dec_dirty_pages(inode); + if (ret) + set_page_dirty(page); +page_out: + f2fs_put_page(page, 1); +iput_out: + iput(inode); +} + +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(sbi, PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; struct pagevec pvec; - int step = ino ? 2 : 0; - int nwritten = 0, wrote = 0; + struct page *last_page = NULL; pagevec_init(&pvec, 0); - -next_step: index = 0; - end = LONG_MAX; + end = ULONG_MAX; while (index <= end) { int i, nr_pages; @@ -1175,6 +1263,190 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic) +{ + pgoff_t index, end; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + int nwritten = 0; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return -EIO; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + update_inode(inode, page); + 
set_dentry_mark(page, + need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } else { + nwritten++; + } + + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } + + if (nwritten) + f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + return ret ? -EIO: 0; +} + +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +{ + pgoff_t index, end; + struct pagevec pvec; + int step = 0; + int nwritten = 0; + int ret = 0; + + pagevec_init(&pvec, 0); + +next_step: + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + ret = -EIO; + goto out; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1189,14 +1461,8 @@ next_step: if (step == 2 && (!IS_DNODE(page) || !is_cold_node(page))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. - */ - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) +lock_node: + if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { @@ -1204,33 +1470,33 @@ continue_unlock: unlock_page(page); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } + /* flush inline_data */ + if (is_inline_node(page)) { + clear_inline_node(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + goto lock_node; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, - need_dentry_mark(sbi, ino)); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); - } + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); else - wrote++; + nwritten++; if (--wbc->nr_to_write == 0) break; @@ -1248,15 +1514,15 @@ continue_unlock: step++; goto next_step; } - - if (wrote) +out: + if (nwritten) f2fs_submit_merged_bio(sbi, NODE, WRITE); - return nwritten; + return ret; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; int ret2 = 0, ret = 0; @@ -1278,7 +1544,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) 
ret = -EIO; } @@ -1317,8 +1583,6 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, NODE); - /* get old block addr of this node page */ nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); @@ -1342,14 +1606,18 @@ static int f2fs_write_node_page(struct page *page, } set_page_writeback(page); - fio.blk_addr = ni.blk_addr; + fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1363,10 +1631,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff; - trace_f2fs_writepages(mapping->host, wbc, NODE); - /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1374,14 +1641,19 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, NODE); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; - sync_node_pages(sbi, 0, wbc); + blk_start_plug(&plug); + sync_node_pages(sbi, wbc); + blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + trace_f2fs_writepages(mapping->host, wbc, NODE); return 0; } @@ -1389,9 +1661,10 @@ static int f2fs_set_node_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, NODE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); @@ -1409,6 +1682,9 @@ const struct address_space_operations f2fs_node_aops = { .set_page_dirty = f2fs_set_node_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -1429,7 +1705,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - bool allocated = false; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1440,14 +1715,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - allocated = true; - up_read(&nm_i->nat_tree_lock); - if (allocated) return 0; } @@ -1516,22 +1786,24 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct 
f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); + down_read(&nm_i->nat_tree_lock); + while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1550,16 +1822,19 @@ static void build_free_nids(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { - block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); - nid = le32_to_cpu(nid_in_journal(sum, i)); + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else remove_free_nid(nm_i, nid); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1575,6 +1850,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_NID)) + return false; +#endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; @@ -1582,8 +1861,6 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1594,13 +1871,6 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1663,12 +1933,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; + if (nm_i->fcnt <= MAX_FREE_NIDS) + return 0; + if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) break; if (i->state == NID_ALLOC) continue; @@ -1695,7 +1968,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); if (!(ri->i_inline & F2FS_INLINE_XATTR)) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } @@ -1703,7 +1976,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) src_addr = inline_xattr_addr(page); inline_size = inline_xattr_size(inode); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(dst_addr, src_addr, inline_size); update_inode: update_inode(inode, ipage); @@ -1737,13 +2010,11 @@ recover_xnid: get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - F2FS_I(inode)->i_xattr_nid = new_xnid; + 
f2fs_i_xnid_write(inode, new_xnid); /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); set_node_addr(sbi, &ni, blkaddr, false); - - update_inode_page(inode); } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1757,15 +2028,18 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - - ipage = grab_cache_page(NODE_MAPPING(sbi), ino); - if (!ipage) - return -ENOMEM; +retry: + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); + if (!ipage) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - SetPageUptodate(ipage); + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); src = F2FS_INODE(page); @@ -1831,28 +2105,26 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; - nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); + nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); - raw_ne = nat_in_journal(sum, i); + raw_ne = nat_in_journal(journal, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } - update_nats_in_cursum(sum, -i); - mutex_unlock(&curseg->curseg_mutex); + update_nats_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } static void __adjust_nat_entry_set(struct nat_entry_set *nes, @@ -1877,24 +2149,23 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct nat_entry_set *set) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; bool to_journal = true; struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. 
*/ - if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { - mutex_lock(&curseg->curseg_mutex); + down_write(&curseg->journal_rwsem); } else { page = get_next_nat_page(sbi, start_nid); nat_blk = page_address(page); @@ -1911,35 +2182,29 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, continue; if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); - raw_ne = &nat_in_journal(sum, offset); - nid_in_journal(sum, offset) = cpu_to_le32(nid); + raw_ne = &nat_in_journal(journal, offset); + nid_in_journal(journal, offset) = cpu_to_le32(nid); } else { raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } if (to_journal) - mutex_unlock(&curseg->curseg_mutex); + up_write(&curseg->journal_rwsem); else f2fs_put_page(page, 1); f2fs_bug_on(sbi, set->entry_cnt); - down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1950,7 +2215,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct nat_entry_set *setvec[SETVEC_SIZE]; struct nat_entry_set *set, *tmp; unsigned int found; @@ -1959,29 +2224,32 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, - MAX_NAT_JENTRIES(sum)); + MAX_NAT_JENTRIES(journal)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2006,6 +2274,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e4fffd2d98c4..868bec65e51c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -15,15 +15,21 @@ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ -#define FREE_NID_PAGES 4 +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) -#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ -#define DEF_RAM_THRESHOLD 10 +#define DEF_RAM_THRESHOLD 1 + +/* control dirty nats ratio threshold (default: 10% over max nid count) */ +#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -117,6 +123,17 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, raw_ne->version = ni->version; } +static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; +} + +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ @@ -183,7 +200,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -212,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) f2fs_change_bit(block_off, nm_i->nat_bitmap); } +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = 
le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { @@ -242,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); - rn->footer.cp_ver = ckpt->checkpoint_ver; + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline nid_t ino_of_node(struct page *node_page) +static inline bool is_recoverable_dnode(struct page *page) { - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.ino); -} + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = cur_cp_version(ckpt); -static inline nid_t nid_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.nid); -} - -static inline unsigned int ofs_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - unsigned flag = le32_to_cpu(rn->footer.flag); - return flag >> OFFSET_BIT_SHIFT; -} - -static inline unsigned long long cpver_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le64_to_cpu(rn->footer.cp_ver); -} - -static inline block_t next_blkaddr_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.next_blkaddr); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + return cpu_to_le64(cp_ver) == cpver_of_node(page); } /* @@ -317,17 +355,17 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - f2fs_wait_on_page_writeback(p, NODE); + f2fs_wait_on_page_writeback(p, NODE, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) @@ -370,6 +408,21 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) +static inline int is_inline_node(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_inline_node(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_inline_node(struct page *page) +{ + ClearPageChecked(page); +} + static inline void set_cold_node(struct inode *inode, struct page *page) { struct f2fs_node *rn = F2FS_NODE(page); diff --git 
a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index cbf74f47cce8..2fc84a991325 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; return true; } @@ -67,42 +68,71 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct inode *inode, struct page *ipage) +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino) +{ + struct inode *inode; + struct fsync_inode_entry *entry; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +} + +static void del_fsync_inode(struct fsync_inode_entry *entry) +{ + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct qstr name; + struct fscrypt_name fname; struct page *page; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; + char *name; - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); + goto out; + } } - if (file_enc_name(inode)) { - iput(dir); - return 0; - } + dir = entry->inode; - name.len = le32_to_cpu(raw_inode->i_namelen); - name.name = raw_inode->i_name; + memset(&fname, 0, sizeof(struct fscrypt_name)); + fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname.disk_name.name = raw_inode->i_name; - if (unlikely(name.len > F2FS_NAME_LEN)) { + if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out_err; + goto out; } retry: - de = f2fs_find_entry(dir, &name, &page); + de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; if (de) { - einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); err = PTR_ERR(einode); @@ -118,29 +148,27 @@ retry: f2fs_delete_entry(de, page, dir, einode); iput(einode); goto retry; - } - err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - if (err) - goto out_err; - - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { - iput(dir); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { - add_dirty_dir_inode(dir); - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + err = __f2fs_do_add_link(dir, &fname, inode, + inode->i_ino, inode->i_mode); } - + if (err == -ENOMEM) + goto retry; goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); -out_err: - iput(dir); out: + if (file_enc_name(inode)) + name = ""; + else + name = raw_inode->i_name; f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir 
= %lx, err = %d", - __func__, ino_of_node(ipage), raw_inode->i_name, + __func__, ino_of_node(ipage), name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } @@ -151,7 +179,7 @@ static void recover_inode(struct inode *inode, struct page *page) char *name; inode->i_mode = le16_to_cpu(raw->i_mode); - i_size_write(inode, le64_to_cpu(raw->i_size)); + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); @@ -168,9 +196,34 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; @@ -180,8 +233,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR, true); - while (1) { struct fsync_inode_entry *entry; @@ -190,49 +241,41 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(page)) break; if (!is_fsync_dnode(page)) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; } - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) { - err = -ENOMEM; - break; - } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. 
*/ - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + entry = add_fsync_inode(sbi, head, ino_of_node(page)); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); if (err == -ENOENT) { err = 0; goto next; } break; } - list_add_tail(&entry->list, head); } entry->blkaddr = blkaddr; - if (IS_INODE(page)) { - entry->last_inode = blkaddr; - if (is_dent_dnode(page)) - entry->last_dentry = blkaddr; - } + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -248,11 +291,8 @@ static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -314,15 +354,14 @@ got_it: if (ino != dn->inode->i_ino) { /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); } else { inode = dn->inode; } - bidx = start_bidx_of_node(offset, F2FS_I(inode)) + - le16_to_cpu(sum.ofs_in_node); + bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); /* * if inode page is locked, unlock temporarily, but its reference @@ -357,10 +396,9 @@ truncate_out: static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { - struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int start, end; struct dnode_of_data dn; struct node_info ni; + unsigned int start, end; int err = 0, recovered = 0; /* step 1: recover xattr */ @@ -380,16 +418,21 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; /* step 3: recover data indices */ - start = start_bidx_of_node(ofs_of_node(page), fi); - end = start + ADDRS_PER_PAGE(page, fi); + start = start_bidx_of_node(ofs_of_node(page), inode); + end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); - +retry_dn: err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_dn; + } goto out; + } - f2fs_wait_on_page_writeback(dn.node_page, NODE); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); @@ -411,14 +454,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } + if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + /* * dest is reserved block, invalidate src block * and then reserve one new block in dnode page. 
*/ if (dest == NEW_ADDR) { truncate_data_blocks_range(&dn, 1); - err = reserve_new_block(&dn); - f2fs_bug_on(sbi, err); + reserve_new_block(&dn); continue; } @@ -427,25 +472,33 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); +#ifdef CONFIG_F2FS_FAULT_INJECTION + while (err) + err = reserve_new_block(&dn); +#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); + if (err) + goto err; } - +retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_prev; + } goto err; + } /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, - ni.version, false); + ni.version, false, false); recovered++; } } - if (IS_INODE(dn.node_page)) - sync_inode_page(&dn); - copy_node_footer(dn.node_page, page); fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); @@ -459,17 +512,16 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *dir_list) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; int err = 0; block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -482,12 +534,12 @@ static int recover_data(struct f2fs_sb_info *sbi, page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) { + if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); break; } - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(page)); if (!entry) goto next; /* @@ -495,10 +547,10 @@ static int recover_data(struct f2fs_sb_info *sbi, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. 
	 */
-		if (entry->last_inode == blkaddr)
+		if (IS_INODE(page))
 			recover_inode(entry->inode, page);
 		if (entry->last_dentry == blkaddr) {
-			err = recover_dentry(entry->inode, page);
+			err = recover_dentry(entry->inode, page, dir_list);
 			if (err) {
 				f2fs_put_page(page, 1);
 				break;
@@ -510,11 +562,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
 			break;
 		}
 
-		if (entry->blkaddr == blkaddr) {
-			iput(entry->inode);
-			list_del(&entry->list);
-			kmem_cache_free(fsync_entry_slab, entry);
-		}
+		if (entry->blkaddr == blkaddr)
+			del_fsync_inode(entry);
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
@@ -525,12 +574,14 @@ next:
 	return err;
 }
 
-int recover_fsync_data(struct f2fs_sb_info *sbi)
+int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
 	struct list_head inode_list;
+	struct list_head dir_list;
 	block_t blkaddr;
 	int err;
+	int ret = 0;
 	bool need_writecp = false;
 
 	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -539,6 +590,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&inode_list);
+	INIT_LIST_HEAD(&dir_list);
 
 	/* prevent checkpoint */
 	mutex_lock(&sbi->cp_mutex);
 
@@ -547,25 +599,26 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	/* step #1: find fsynced inode numbers */
 	err = find_fsync_dnodes(sbi, &inode_list);
-	if (err)
+	if (err || list_empty(&inode_list))
 		goto out;
 
-	if (list_empty(&inode_list))
+	if (check_only) {
+		ret = 1;
 		goto out;
+	}
 
 	need_writecp = true;
 
 	/* step #2: recover data */
-	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+	err = recover_data(sbi, &inode_list, &dir_list);
 	if (!err)
 		f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
 	destroy_fsync_dnodes(&inode_list);
-	kmem_cache_destroy(fsync_entry_slab);
 
 	/* truncate meta pages to be used by the recovery */
 	truncate_inode_pages_range(META_MAPPING(sbi),
-			(loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+			(loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1);
 
 	if (err) {
 		truncate_inode_pages_final(NODE_MAPPING(sbi));
@@ -573,31 +626,20 @@ out:
 	}
 
 	clear_sbi_flag(sbi, SBI_POR_DOING);
-	if (err) {
-		bool invalidate = false;
+	if (err)
+		set_ckpt_flags(sbi, CP_ERROR_FLAG);
+	mutex_unlock(&sbi->cp_mutex);
 
-		if (discard_next_dnode(sbi, blkaddr))
-			invalidate = true;
+	/* let's drop all the directory inodes for clean checkpoint */
+	destroy_fsync_dnodes(&dir_list);
 
-		/* Flush all the NAT/SIT pages */
-		while (get_pages(sbi, F2FS_DIRTY_META))
-			sync_meta_pages(sbi, META, LONG_MAX);
-
-		/* invalidate temporary meta page */
-		if (invalidate)
-			invalidate_mapping_pages(META_MAPPING(sbi),
-							blkaddr, blkaddr);
-
-		set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
-		mutex_unlock(&sbi->cp_mutex);
-	} else if (need_writecp) {
+	if (!err && need_writecp) {
 		struct cp_control cpc = {
 			.reason = CP_RECOVERY,
 		};
-		mutex_unlock(&sbi->cp_mutex);
-		write_checkpoint(sbi, &cpc);
-	} else {
-		mutex_unlock(&sbi->cp_mutex);
+		err = write_checkpoint(sbi, &cpc);
 	}
-	return err;
+
+	kmem_cache_destroy(fsync_entry_slab);
+	return ret ? ret : err;
 }
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7965957dd0e6..b3c61ae37f92 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
 /*
  * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
  * f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * @size must be an integral multiple of BITS_PER_LONG.
* Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 @@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp &= ~0UL >> offset; + while (1) { + if (*p == 0) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG-1)) { tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp &= (~0UL << (BITS_PER_LONG - size)); - if (!tmp) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffs(tmp); + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp |= ~((~0UL << offset) >> offset); + while (1) { + if (*p == ~0UL) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp != ~0UL) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; if (tmp != ~0UL) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp |= ~(~0UL << (BITS_PER_LONG - size)); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. 
 */
-found_middle:
-	return result + __reverse_ffz(tmp);
+	return result;
+found:
+	return result - size + __reverse_ffz(tmp);
 }
 
 void register_inmem_page(struct inode *inode, struct page *page)
@@ -211,69 +191,149 @@ void register_inmem_page(struct inode *inode, struct page *page)
 	trace_f2fs_register_inmem_page(page, INMEM);
 }
 
-int commit_inmem_pages(struct inode *inode, bool abort)
+static int __revoke_inmem_pages(struct inode *inode,
+			struct list_head *head, bool drop, bool recover)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct inmem_pages *cur, *tmp;
+	int err = 0;
+
+	list_for_each_entry_safe(cur, tmp, head, list) {
+		struct page *page = cur->page;
+
+		if (drop)
+			trace_f2fs_commit_inmem_page(page, INMEM_DROP);
+
+		lock_page(page);
+
+		if (recover) {
+			struct dnode_of_data dn;
+			struct node_info ni;
+
+			trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
+
+			set_new_dnode(&dn, inode, NULL, NULL, 0);
+			if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+				err = -EAGAIN;
+				goto next;
+			}
+			get_node_info(sbi, dn.nid, &ni);
+			f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+					cur->old_addr, ni.version, true, true);
+			f2fs_put_dnode(&dn);
+		}
+next:
+		/* we don't need to invalidate this in the successful status */
+		if (drop || recover)
+			ClearPageUptodate(page);
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		f2fs_put_page(page, 1);
+
+		list_del(&cur->list);
+		kmem_cache_free(inmem_entry_slab, cur);
+		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+	}
+	return err;
+}
+
+void drop_inmem_pages(struct inode *inode)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+
+	clear_inode_flag(inode, FI_ATOMIC_FILE);
+
+	mutex_lock(&fi->inmem_lock);
+	__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+	mutex_unlock(&fi->inmem_lock);
+}
+
+static int __commit_inmem_pages(struct inode *inode,
+					struct list_head *revoke_list)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct inmem_pages *cur, *tmp;
-	bool submit_bio = false;
 	struct f2fs_io_info fio = {
 		.sbi = sbi,
 		.type = DATA,
 		.rw = WRITE_SYNC | REQ_PRIO,
 		.encrypted_page = NULL,
 	};
+	bool submit_bio = false;
 	int err = 0;
 
-	/*
-	 * The abort is true only when f2fs_evict_inode is called.
-	 * Basically, the f2fs_evict_inode doesn't produce any data writes, so
-	 * that we don't need to call f2fs_balance_fs.
-	 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
-	 * inode becomes free by iget_locked in f2fs_iget.
-	 */
-	if (!abort) {
-		f2fs_balance_fs(sbi);
-		f2fs_lock_op(sbi);
+	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
+		struct page *page = cur->page;
+
+		lock_page(page);
+		if (page->mapping == inode->i_mapping) {
+			trace_f2fs_commit_inmem_page(page, INMEM);
+
+			set_page_dirty(page);
+			f2fs_wait_on_page_writeback(page, DATA, true);
+			if (clear_page_dirty_for_io(page))
+				inode_dec_dirty_pages(inode);
+
+			fio.page = page;
+			err = do_write_data_page(&fio);
+			if (err) {
+				unlock_page(page);
+				break;
+			}
+
+			/* record old blkaddr for revoking */
+			cur->old_addr = fio.old_blkaddr;
+
+			clear_cold_data(page);
+			submit_bio = true;
+		}
+		unlock_page(page);
+		list_move_tail(&cur->list, revoke_list);
 	}
-	mutex_lock(&fi->inmem_lock);
-	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
-		lock_page(cur->page);
-		if (!abort) {
-			if (cur->page->mapping == inode->i_mapping) {
-				set_page_dirty(cur->page);
-				f2fs_wait_on_page_writeback(cur->page, DATA);
-				if (clear_page_dirty_for_io(cur->page))
-					inode_dec_dirty_pages(inode);
-				trace_f2fs_commit_inmem_page(cur->page, INMEM);
-				fio.page = cur->page;
-				err = do_write_data_page(&fio);
-				if (err) {
-					unlock_page(cur->page);
-					break;
-				}
-				clear_cold_data(cur->page);
-				submit_bio = true;
-			}
-		} else {
-			trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
-		}
-		set_page_private(cur->page, 0);
-		ClearPagePrivate(cur->page);
-		f2fs_put_page(cur->page, 1);
+	if (submit_bio)
+		f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
 
-		list_del(&cur->list);
-		kmem_cache_free(inmem_entry_slab, cur);
-		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+	if (!err)
+		__revoke_inmem_pages(inode, revoke_list, false, false);
+
+	return err;
+}
+
+int commit_inmem_pages(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct list_head revoke_list;
+	int err;
+
+	INIT_LIST_HEAD(&revoke_list);
+	f2fs_balance_fs(sbi, true);
+	f2fs_lock_op(sbi);
+
+	mutex_lock(&fi->inmem_lock);
+	err = __commit_inmem_pages(inode, &revoke_list);
+	if (err) {
+		int ret;
+		/*
+		 * try to revoke all committed pages, but still we could fail
+		 * due to no memory or other reasons; if that happens, EAGAIN
+		 * will be returned, which means the transaction is no longer
+		 * consistent and the caller should use the journal to recover,
+		 * or rewrite and commit the last transaction. For any other
+		 * error number, revoking was done by the filesystem itself.
+		 */
+		ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
+		if (ret)
+			err = ret;
+
+		/* drop all uncommitted pages */
+		__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
 	}
 	mutex_unlock(&fi->inmem_lock);
 
-	if (!abort) {
-		f2fs_unlock_op(sbi);
-		if (submit_bio)
-			f2fs_submit_merged_bio(sbi, DATA, WRITE);
-	}
+	f2fs_unlock_op(sbi);
 
 	return err;
 }
@@ -281,13 +341,25 @@ int commit_inmem_pages(struct inode *inode, bool abort)
  * This function balances dirty node and dentry pages.
  * In addition, it controls garbage collection.
  */
-void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
 {
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+	if (time_to_inject(sbi, FAULT_CHECKPOINT))
+		f2fs_stop_checkpoint(sbi, false);
+#endif
+
+	if (!need)
+		return;
+
+	/* balance_fs_bg is able to be pending */
+	if (excess_cached_nats(sbi))
+		f2fs_balance_fs_bg(sbi);
+
 	/*
 	 * We should do GC or end up with checkpoint, if there are so many dirty
 	 * dir/node pages without enough free segments.
*/ - if (has_not_enough_free_secs(sbi, 0)) { + if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi, false); } @@ -304,14 +376,26 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + try_to_free_nids(sbi, MAX_FREE_NIDS); + else + build_free_nids(sbi); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || - excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES) || - jiffies > sbi->cp_expires) + excess_prefree_segs(sbi) || + excess_dirty_nats(sbi) || + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) { + struct blk_plug plug; + + blk_start_plug(&plug); + sync_dirty_inodes(sbi, FILE_INODE); + blk_finish_plug(&plug); + } f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -361,24 +445,28 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) { + if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { struct bio *bio = f2fs_bio_alloc(0); int ret; + atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); + atomic_dec(&fcc->submit_flush); bio_put(bio); return ret; } init_completion(&cmd.wait); + atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); return cmd.ret; } @@ -392,6 +480,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; + atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; @@ -513,28 +602,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - int err = -ENOTSUPP; - - if (test_opt(sbi, DISCARD)) { - struct seg_entry *se = get_seg_entry(sbi, - GET_SEGNO(sbi, blkaddr)); - unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - - if (f2fs_test_bit(offset, se->discard_map)) - return false; - - err = f2fs_issue_discard(sbi, blkaddr, 1); - } - - if (err) { - update_meta_page(sbi, NULL, blkaddr); - return true; - } - return false; -} - static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) @@ -573,7 +640,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool force = (cpc->reason == CP_DISCARD); int i; - if (se->valid_blocks == max_blocks) + if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) return; if (!force) { @@ -593,6 +660,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -630,6 +701,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = 
dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason == CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -646,17 +719,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (!test_opt(sbi, DISCARD)) + if (force || !test_opt(sbi, DISCARD)) continue; - f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SECNO(sbi, start); + start_segno = secno * sbi->segs_per_sec; + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + if (force && entry->len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); cpc->trimmed += entry->len; @@ -711,12 +798,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -817,12 +906,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) } } - sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -841,9 +930,9 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) void *dst = page_address(page); if (src) - memcpy(dst, src, PAGE_CACHE_SIZE); + memcpy(dst, src, PAGE_SIZE); else - memset(dst, 0, PAGE_CACHE_SIZE); + memset(dst, 0, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -854,6 +943,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi, update_meta_page(sbi, (void *)sum_blk, blk_addr); } +static void write_current_sum_page(struct f2fs_sb_info *sbi, + int type, block_t blk_addr) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct page *page = grab_meta_page(sbi, blk_addr); + struct f2fs_summary_block *src = curseg->sum_blk; + struct f2fs_summary_block *dst; + + dst = (struct f2fs_summary_block *)page_address(page); + + mutex_lock(&curseg->curseg_mutex); + + down_read(&curseg->journal_rwsem); + memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + up_read(&curseg->journal_rwsem); + + memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); + memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + + mutex_unlock(&curseg->curseg_mutex); + + set_page_dirty(page); + f2fs_put_page(page, 1); 
+} + static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -886,9 +1000,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - MAIN_SEGS(sbi), *newseg + 1); - if (segno - *newseg < sbi->segs_per_sec - - (*newseg % sbi->segs_per_sec)) + (hint + 1) * sbi->segs_per_sec, *newseg + 1); + if (segno < (hint + 1) * sbi->segs_per_sec) goto got_it; } find_other_zone: @@ -1071,7 +1184,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); @@ -1120,6 +1233,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + if (test_opt(sbi, LFS)) + return; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segments(sbi, i); } @@ -1134,6 +1250,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1142,6 +1259,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (end <= MAIN_BLKADDR(sbi)) goto out; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Found FS corruption, run fsck to fix."); + goto out; + } + /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : @@ -1164,12 +1287,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1256,7 +1383,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0)) + !has_not_enough_free_secs(sbi, 0, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1292,11 +1419,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); - allocate_data_block(fio->sbi, fio->page, fio->blk_addr, - &fio->blk_addr, sum, type); + if (fio->type == NODE || fio->type == DATA) + mutex_lock(&fio->sbi->wio_mutex[fio->type]); + + allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ f2fs_submit_page_mbio(fio); + + if (fio->type == NODE || fio->type == DATA) + mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1305,7 +1438,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .rw = WRITE_SYNC | REQ_META | REQ_PRIO, - .blk_addr = page->index, + .old_blkaddr = page->index, + .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, }; @@ -1335,19 +1469,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); - dn->data_blkaddr = fio->blk_addr; + f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } void rewrite_data_page(struct f2fs_io_info *fio) { + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); f2fs_submit_page_mbio(fio); } -static void __f2fs_replace_block(struct f2fs_sb_info *sbi, - struct f2fs_summary *sum, +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg) + bool recover_curseg, bool recover_newaddr) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -1390,7 +1524,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg) + if (!recover_curseg || recover_newaddr) update_sit_entry(sbi, new_blkaddr, 1); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); @@ -1414,66 +1548,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, - unsigned char version, bool recover_curseg) + unsigned char version, bool recover_curseg, + bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); - __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); + __f2fs_replace_block(sbi, &sum, old_addr, new_addr, + recover_curseg, recover_newaddr); - dn->data_blkaddr = new_addr; - set_data_blkaddr(dn); - 
f2fs_update_extent_cache(dn); -} - -static inline bool is_merged_page(struct f2fs_sb_info *sbi, - struct page *page, enum page_type type) -{ - enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - struct bio_vec *bvec; - struct page *target; - int i; - - down_read(&io->io_rwsem); - if (!io->bio) { - up_read(&io->io_rwsem); - return false; - } - - bio_for_each_segment_all(bvec, io->bio, i) { - - if (bvec->bv_page->mapping) { - target = bvec->bv_page; - } else { - struct f2fs_crypto_ctx *ctx; - - /* encrypted page */ - ctx = (struct f2fs_crypto_ctx *)page_private( - bvec->bv_page); - target = ctx->w.control_page; - } - - if (page == target) { - up_read(&io->io_rwsem); - return true; - } - } - - up_read(&io->io_rwsem); - return false; + f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type) + enum page_type type, bool ordered) { if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - if (is_merged_page(sbi, page, type)) - f2fs_submit_merged_bio(sbi, type, WRITE); - wait_on_page_writeback(page); + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + if (ordered) + wait_on_page_writeback(page); + else + wait_for_stable_page(page); } } @@ -1482,14 +1580,12 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, { struct page *cpage; - if (blkaddr == NEW_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return; - f2fs_bug_on(sbi, blkaddr == NULL_ADDR); - cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { - f2fs_wait_on_page_writeback(cpage, DATA); + f2fs_wait_on_page_writeback(cpage, DATA, true); f2fs_put_page(cpage, 1); } } @@ -1510,12 +1606,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, - SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ @@ -1539,7 +1634,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1611,7 +1706,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); + + /* update journal info */ + down_write(&curseg->journal_rwsem); + memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + up_write(&curseg->journal_rwsem); + + memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); + memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; @@ -1623,14 +1725,10 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { - struct f2fs_summary_block *s_sits = - CURSEG_I(sbi, CURSEG_COLD_DATA)->sum_blk; - struct f2fs_summary_block *s_nats = - CURSEG_I(sbi, CURSEG_HOT_DATA)->sum_blk; int 
type = CURSEG_HOT_DATA; int err; - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = npages_for_summary_flush(sbi, true); if (npages >= 2) @@ -1653,11 +1751,6 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } - /* sanity check for summary blocks */ - if (nats_in_cursum(s_nats) > NAT_JOURNAL_ENTRIES || - sits_in_cursum(s_sits) > SIT_JOURNAL_ENTRIES) - return -EINVAL; - return 0; } @@ -1675,13 +1768,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); + memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, - SUM_JOURNAL_SIZE); + memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ @@ -1703,7 +1795,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1727,17 +1819,13 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, else end = type + NR_CURSEG_NODE_TYPE; - for (i = type; i < end; i++) { - struct curseg_info *sum = CURSEG_I(sbi, i); - mutex_lock(&sum->curseg_mutex); - write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); - mutex_unlock(&sum->curseg_mutex); - } + for (i = type; i < end; i++) + write_current_sum_page(sbi, i, blkaddr + (i - type)); } void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); @@ -1748,24 +1836,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { - for (i = 0; i < nats_in_cursum(sum); i++) { - if (le32_to_cpu(nid_in_journal(sum, i)) == val) + for (i = 0; i < nats_in_cursum(journal); i++) { + if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) - return update_nats_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { - for (i = 0; i < sits_in_cursum(sum); i++) - if (le32_to_cpu(segno_in_journal(sum, i)) == val) + for (i = 0; i < sits_in_cursum(journal); i++) + if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) - return update_sits_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + return update_sits_in_cursum(journal, 1); } return -1; } @@ -1794,7 +1882,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + 
memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -1869,20 +1957,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi) static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; - segno = le32_to_cpu(segno_in_journal(sum, i)); + segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } - update_sits_in_cursum(sum, -sits_in_cursum(sum)); + update_sits_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } /* @@ -1894,13 +1984,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = true; struct seg_entry *se; - mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) @@ -1917,7 +2006,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and add and account * them in sit entry set. */ - if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) + if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL)) remove_sits_in_journal(sbi); /* @@ -1934,10 +2023,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int segno = start_segno; if (to_journal && - !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) + !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; - if (!to_journal) { + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { page = get_next_sit_page(sbi, start_segno); raw_sit = page_address(page); } @@ -1955,13 +2046,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); - segno_in_journal(sum, offset) = + segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, - &sit_in_journal(sum, offset)); + &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, @@ -1973,7 +2064,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) ses->entry_cnt--; } - if (!to_journal) + if (to_journal) + up_write(&curseg->journal_rwsem); + else f2fs_put_page(page, 1); f2fs_bug_on(sbi, ses->entry_cnt); @@ -1988,7 +2081,6 @@ out: add_discard_addrs(sbi, cpc); } mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); set_prefree_as_free_segments(sbi); } @@ -2024,12 +2116,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if 
(!sit_i->sentries[start].cur_valid_map || - !sit_i->sentries[start].ckpt_valid_map || - !sit_i->sentries[start].discard_map) + !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; + + if (f2fs_discard_en(sbi)) { + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; + } } sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2117,9 +2213,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; + init_rwsem(&array[i].journal_rwsem); + array[i].journal = kzalloc(sizeof(struct f2fs_journal), + GFP_KERNEL); + if (!array[i].journal) + return -ENOMEM; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; } @@ -2130,11 +2231,13 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi); + int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); @@ -2143,41 +2246,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; struct page *page; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) - == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; - } - } - mutex_unlock(&curseg->curseg_mutex); - + se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); -got_it: + check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ - memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; - - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - + se->valid_blocks; } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } + + if 
(sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks - old_valid_blocks; + } + up_read(&curseg->journal_rwsem); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -2310,7 +2430,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!test_opt(sbi, LFS)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; @@ -2392,8 +2516,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) if (!array) return; SM_I(sbi)->curseg_array = NULL; - for (i = 0; i < NR_CURSEG_TYPE; i++) + for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); + kfree(array[i].journal); + } kfree(array); } @@ -2459,7 +2585,7 @@ int __init create_segment_manager_caches(void) sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destory_discard_entry; + goto destroy_discard_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2469,7 +2595,7 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destory_discard_entry: +destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ee44d346ea44..fecb856ad874 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,6 +16,7 @@ #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -158,16 +159,17 @@ struct victim_sel_policy { }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. */ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; @@ -183,7 +185,7 @@ struct segment_allocation { * this value is set in page as a private data which indicate that * the page is atomically written, and it is in inmem_pages list. 
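
The seg_entry rework in the segment.h hunk above folds type, valid_blocks and ckpt_valid_blocks into 6/10/10-bit fields of a single 32-bit word (plus 6 bits of padding), where the old layout spent two shorts and a char. Ten bits count up to 1023, enough for the default 512 blocks per segment. A standalone C11 sketch of the packing (names hypothetical; one-word packing is what common ABIs produce):

	/*
	 * Editorial sketch, not part of the patch: the reworked seg_entry
	 * packs four fields into one 32-bit word (6 + 10 + 10 + 6 bits).
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	struct packed_se {
		uint32_t type:6;		/* segment type */
		uint32_t valid_blocks:10;	/* # of valid blocks */
		uint32_t ckpt_valid_blocks:10;	/* # valid at last checkpoint */
		uint32_t padding:6;		/* spare bits */
	};

	static_assert(sizeof(struct packed_se) == sizeof(uint32_t),
		      "bitfields expected to share one 32-bit unit");

	int main(void)
	{
		struct packed_se se = { .type = 5, .valid_blocks = 512 };

		printf("type=%u valid=%u bytes=%zu\n",
		       (unsigned)se.type, (unsigned)se.valid_blocks,
		       sizeof(se));
		return 0;
	}
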
*/ -#define ATOMIC_WRITTEN_PAGE 0x0000ffff +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) @@ -191,6 +193,7 @@ struct segment_allocation { struct inmem_pages { struct list_head list; struct page *page; + block_t old_addr; /* for revoking when fail to commit */ }; struct sit_info { @@ -257,6 +260,8 @@ struct victim_selection { struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ + struct rw_semaphore journal_rwsem; /* protect journal area */ + struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ @@ -466,20 +471,27 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + + if (test_opt(sbi, LFS)) + return false; + return free_sections(sbi) <= (node_secs + 2 * dent_secs + reserved_sections(sbi) + 1); } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi)); + return (free_sections(sbi) + freed) <= + (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -527,6 +539,9 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; + if (test_opt(sbi, LFS)) + return false; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -540,7 +555,7 @@ static inline bool need_inplace_update(struct inode *inode) /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + is_inode_flag_set(inode, FI_NEED_IPU)) return true; return false; @@ -573,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -702,9 +717,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) - return 3 * sbi->blocks_per_seg; + return 8 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(sbi); + return 8 * MAX_BIO_BLOCKS(sbi); else return 0; } @@ -722,10 +737,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - desired = 4096; - else if (type == NODE) - desired = 3 * max_hw_blocks(sbi); + if (type == NODE) + desired = 2 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b55a5..46c915425923 100644 --- 
a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -13,6 +13,7 @@ #include #include "f2fs.h" +#include "node.h" static LIST_HEAD(f2fs_list); static DEFINE_SPINLOCK(f2fs_list_lock); @@ -25,14 +26,15 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) - return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) + return NM_I(sbi)->fcnt - MAX_FREE_NIDS; return 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2ac3417d9412..fd249cc9b96e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,35 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +#ifdef CONFIG_F2FS_FAULT_INJECTION + +char *fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_IO] = "IO error", + [FAULT_CHECKPOINT] = "checkpoint error", +}; + +static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned int rate) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(ffi, 0, sizeof(struct f2fs_fault_info)); + } +} +#endif + /* f2fs-wide shrinker description */ static struct shrinker f2fs_shrinker_info = { .scan_objects = f2fs_shrink_scan, @@ -51,6 +80,7 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, + Opt_nodiscard, Opt_noheap, Opt_user_xattr, Opt_nouser_xattr, @@ -61,12 +91,19 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_inline_dentry, + Opt_noinline_dentry, Opt_flush_merge, + Opt_noflush_merge, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, + Opt_mode, + Opt_fault_injection, + Opt_lazytime, + Opt_nolazytime, Opt_err, }; @@ -75,6 +112,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, @@ -85,12 +123,19 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, + {Opt_mode, "mode=%s"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_err, NULL}, }; @@ -100,6 +145,10 @@ enum { SM_INFO, /* struct f2fs_sm_info */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + 
FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif }; struct f2fs_attr { @@ -121,9 +170,27 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif return NULL; } +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -157,6 +224,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif *ui = t; return count; } @@ -202,6 +273,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \ f2fs_sbi_show, f2fs_sbi_store, \ offsetof(struct struct_name, elname)) +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); @@ -214,9 +288,16 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -234,7 +315,14 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), NULL, }; @@ -330,6 +418,9 @@ static int parse_options(struct super_block *sb, char *options) "the device does not support discard"); } break; + case Opt_nodiscard: + clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -388,9 +478,15 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_dentry: set_opt(sbi, INLINE_DENTRY); break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; case Opt_flush_merge: set_opt(sbi, 
FLUSH_MERGE); break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; @@ -406,6 +502,42 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 8 && + !strncmp(name, "adaptive", 8)) { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } else if (strlen(name) == 3 && + !strncmp(name, "lfs", 3)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, arg); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -426,26 +558,25 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); + if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { + kmem_cache_free(f2fs_inode_cachep, fi); + return NULL; + } + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - - set_inode_flag(fi, FI_NEW_INODE); - - if (test_opt(F2FS_SB(sb), INLINE_XATTR)) - set_inode_flag(fi, FI_INLINE_XATTR); + init_rwsem(&fi->dio_rwsem[READ]); + init_rwsem(&fi->dio_rwsem[WRITE]); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - fi->i_crypt_info = NULL; -#endif return &fi->vfs_inode; } @@ -458,7 +589,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ atomic_inc(&inode->i_count); @@ -466,32 +597,66 @@ static int f2fs_drop_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); sb_start_intwrite(inode->i_sb); - i_size_write(inode, 0); + f2fs_i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, true); + f2fs_truncate(inode); sb_end_intwrite(inode->i_sb); -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (F2FS_I(inode)->i_crypt_info) - f2fs_free_encryption_info(inode, - F2FS_I(inode)->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } return 0; } + return generic_drop_inode(inode); } +int f2fs_inode_dirtied(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 1; + } + + set_inode_flag(inode, FI_DIRTY_INODE); + 
list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + stat_inc_dirty_inode(sbi, DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + return 0; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + list_del_init(&F2FS_I(inode)->gdirty_list); + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + /* * f2fs_dirty_inode() is called from __mark_inode_dirty() * @@ -499,7 +664,19 @@ static int f2fs_drop_inode(struct inode *inode) */ static void f2fs_dirty_inode(struct inode *inode, int flags) { - set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (flags == I_DIRTY_TIME) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode); } static void f2fs_i_callback(struct rcu_head *head) @@ -510,15 +687,27 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { + percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_COUNT_TYPE; i++) + percpu_counter_destroy(&sbi->nr_pages[i]); + percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); @@ -534,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb) * clean checkpoint again. */ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -548,12 +737,15 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. 
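
The f2fs_inode_dirtied()/f2fs_inode_synced() pair added above keeps a DIRTY_META inode list under a spinlock, with a per-inode flag making both operations idempotent; dirtied reports whether the inode was already tracked. A small userspace model of that flag-plus-intrusive-list pattern (all names hypothetical, a pthread mutex standing in for the spinlock):

	/*
	 * Editorial sketch, not part of the patch: idempotent dirty-set
	 * tracking with a flag and a lock-protected intrusive list.
	 */
	#include <pthread.h>
	#include <stdbool.h>

	struct node {
		struct node *prev, *next;
		bool dirty;
	};

	static struct node dirty_list = { &dirty_list, &dirty_list, false };
	static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;

	/* returns 1 if the inode was already on the dirty list */
	static int mark_dirty(struct node *n)
	{
		pthread_mutex_lock(&dirty_lock);
		if (n->dirty) {
			pthread_mutex_unlock(&dirty_lock);
			return 1;
		}
		n->dirty = true;
		n->prev = dirty_list.prev;
		n->next = &dirty_list;
		dirty_list.prev->next = n;
		dirty_list.prev = n;
		pthread_mutex_unlock(&dirty_lock);
		return 0;
	}

	static void mark_synced(struct node *n)
	{
		pthread_mutex_lock(&dirty_lock);
		if (n->dirty) {
			n->prev->next = n->next;
			n->next->prev = n->prev;
			n->dirty = false;
		}
		pthread_mutex_unlock(&dirty_lock);
	}

	int main(void)
	{
		struct node ino = { 0 };

		mark_dirty(&ino);		/* newly tracked -> returns 0 */
		int again = mark_dirty(&ino);	/* already dirty -> returns 1 */
		mark_synced(&ino);		/* unlinks and clears the flag */
		mark_synced(&ino);		/* second call is a no-op */
		return again == 1 ? 0 : 1;
	}
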
*/ - release_dirty_inode(sbi); + release_ino_entry(sbi, true); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); + /* our cp_error case, we can wait for any writeback page */ + f2fs_flush_merged_bios(sbi); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -566,13 +758,18 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->raw_super); + + destroy_percpu_info(sbi); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -582,14 +779,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -623,7 +818,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -676,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) @@ -686,6 +883,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (test_opt(sbi, ADAPTIVE)) + seq_puts(seq, "adaptive"); + else if (test_opt(sbi, LFS)) + seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -718,19 +923,47 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_info_open_fs(struct inode *inode, struct file *file) +static int segment_bits_seq_show(struct seq_file *seq, void *offset) { - return single_open(file, segment_info_seq_show, PDE_DATA(inode)); + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, 1)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; } -static const struct file_operations f2fs_seq_segment_info_fops = { - .owner = THIS_MODULE, - .open = segment_info_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, 
struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .owner = THIS_MODULE, \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ }; +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ @@ -738,7 +971,16 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + sbi->sb->s_flags |= MS_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + set_opt(sbi, DISCARD); + } else { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -746,6 +988,10 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0); +#endif } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -756,8 +1002,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - - sync_filesystem(sb); +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info ffi = sbi->fault_info; +#endif /* * Save the old mount options in case we @@ -766,6 +1013,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + sbi->mount_opt.opt = 0; default_options(sbi); @@ -797,7 +1053,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); need_restart_gc = true; } } else if (!sbi->gc_thread) { @@ -807,6 +1062,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } + if (*flags & MS_RDONLY) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -820,8 +1085,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } skip: /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
MS_POSIXACL : 0); + return 0; restore_gc: if (need_restart_gc) { @@ -834,6 +1100,9 @@ restore_gc: restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; +#ifdef CONFIG_F2FS_FAULT_INJECTION + sbi->fault_info = ffi; +#endif return err; } @@ -853,6 +1122,48 @@ static struct super_operations f2fs_sops = { .remount_fs = f2fs_remount, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) +{ + return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, NULL); +} + +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + +static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, fs_data, XATTR_CREATE); +} + +static unsigned f2fs_max_namelen(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? + inode->i_sb->s_blocksize : F2FS_NAME_LEN; +} + +static struct fscrypt_operations f2fs_cryptops = { + .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, + .set_context = f2fs_set_context, + .is_encrypted = f2fs_encrypted_inode, + .empty_dir = f2fs_empty_dir, + .max_namelen = f2fs_max_namelen, +}; +#else +static struct fscrypt_operations f2fs_cryptops = { + .is_encrypted = f2fs_encrypted_inode, +}; +#endif + static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -898,7 +1209,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -914,13 +1225,29 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } -static inline bool sanity_check_area_boundary(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int __f2fs_commit_super(struct buffer_head *bh, + struct f2fs_super_block *super) { + lock_buffer(bh); + if (super) + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); @@ -934,6 +1261,10 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); u32 segment_count = le32_to_cpu(raw_super->segment_count); u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + (segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + (segment_count << log_blocks_per_seg); if (segment0_blkaddr != cp_blkaddr) { f2fs_msg(sb, KERN_INFO, @@ -978,22 +1309,47 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, return 
true; } - if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != - segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + if (main_end_blkaddr > seg_end_blkaddr) { f2fs_msg(sb, KERN_INFO, - "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", main_blkaddr, - segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment0_blkaddr + + (segment_count << log_blocks_per_seg), segment_count_main << log_blocks_per_seg); return true; - } + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(bh, NULL); + res = err ? "failed" : "done"; + } + f2fs_msg(sb, KERN_INFO, + "Fix alignment : %s, start(%u) end(%u) block(%u)", + res, main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + if (err) + return true; + } return false; } -static int sanity_check_raw_super(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) { + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; unsigned int blocksize; if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { @@ -1004,10 +1360,10 @@ static int sanity_check_raw_super(struct super_block *sb, } /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + if (F2FS_BLKSIZE != PAGE_SIZE) { f2fs_msg(sb, KERN_INFO, "Invalid page_cache_size (%lu), supports only 4KB\n", - PAGE_CACHE_SIZE); + PAGE_SIZE); return 1; } @@ -1059,27 +1415,18 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } - if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { - f2fs_msg(sb, KERN_INFO, - "Invalid segment count (%u)", - le32_to_cpu(raw_super->segment_count)); - return 1; - } - /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ - if (sanity_check_area_boundary(sb, raw_super)) + if (sanity_check_area_boundary(sbi, bh)) return 1; return 0; } -static int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned int main_segs, blocks_per_seg; - int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1091,22 +1438,6 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; - main_segs = le32_to_cpu(sbi->raw_super->segment_count_main); - blocks_per_seg = sbi->blocks_per_seg; - - for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { - if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || - le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) { - return 1; - } - } - for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { - if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || - le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) { - return 1; - } - } - if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1117,7 +1448,6 @@ static int sanity_check_ckpt(struct 
f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1137,111 +1467,131 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->cur_victim_sec = NULL_SECNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - for (i = 0; i < NR_COUNT_TYPE; i++) - atomic_set(&sbi->nr_pages[i], 0); - sbi->dir_level = DEF_DIR_LEVEL; - sbi->cp_interval = DEF_CP_INTERVAL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + mutex_init(&sbi->wio_mutex[NODE]); + mutex_init(&sbi->wio_mutex[DATA]); + spin_lock_init(&sbi->cp_lock); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int i, err; + + for (i = 0; i < NR_COUNT_TYPE; i++) { + err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); + if (err) + return err; + } + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + return percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); } /* * Read f2fs raw super block. - * Because we have two copies of super block, so read the first one at first, - * if the first one is invalid, move to read the second one. + * Because we have two copies of super block, so read both of them + * to get the first valid one. If any one of them is broken, we pass + * them recovery flag back to the caller. */ -static int read_raw_super_block(struct super_block *sb, +static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { - int block = 0; - struct buffer_head *buffer; + struct super_block *sb = sbi->sb; + int block; + struct buffer_head *bh; struct f2fs_super_block *super; int err = 0; -retry: - buffer = sb_bread(sb, block); - if (!buffer) { - *recovery = 1; - f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; + + for (block = 0; block < 2; block++) { + bh = sb_bread(sb, block); + if (!bh) { + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { err = -EIO; - goto out; + continue; } - } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); - - /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); - *recovery = 1; - f2fs_msg(sb, KERN_ERR, - "Can't find valid F2FS filesystem in %dth superblock", - block + 1); - if (block == 0) { - block++; - goto retry; - } else { + /* sanity checking of raw super */ + if (sanity_check_raw_super(sbi, bh)) { + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); err = -EINVAL; - goto out; + brelse(bh); + continue; } + + if (!*raw_super) { + memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, + sizeof(*super)); + *valid_super_block = block; + *raw_super = super; + } + brelse(bh); } - if (!*raw_super) { - *raw_super_buf = buffer; - *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); - } + /* Fail to read 
any one of the superblocks*/ + if (err < 0) + *recovery = 1; - /* check the validity of the second superblock */ - if (block == 0) { - block++; - goto retry; - } - -out: /* No valid superblock */ if (!*raw_super) - return err; + kfree(super); + else + err = 0; - return 0; + return err; } int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; + struct buffer_head *bh; int err; - /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } - sbh->b_blocknr = block; + /* write back-up superblock first */ + bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); + bh = sb_getblk(sbi->sb, sbi->valid_super_block); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); return err; } @@ -1249,17 +1599,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; - long err; + int err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; + struct curseg_info *seg_i; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1267,17 +1617,31 @@ try_onemore: if (!sbi) return -ENOMEM; + sbi->sb = sb; + + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver."); + err = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto free_sbi; + } + /* set a block size */ if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1290,11 +1654,14 @@ try_onemore: if (err) goto free_options; - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); sb->s_op = &f2fs_sops; + sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; @@ -1304,11 +1671,8 @@ try_onemore: memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->sb = sb; - sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = 
valid_super_block; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); @@ -1329,6 +1693,10 @@ try_onemore: init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = init_percpu_info(sbi); + if (err) + goto free_options; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { @@ -1343,24 +1711,19 @@ try_onemore: goto free_meta_inode; } - /* sanity checking of checkpoint */ - err = -EINVAL; - if (sanity_check_ckpt(sbi)) { - f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); - goto free_cp; - } - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1380,6 +1743,17 @@ try_onemore: goto free_nm; } + /* For write statistics */ + if (sb->s_bdev->bd_part) + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]); + + /* Read accumulated write IO statistics if exists */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + if (__exist_node_summaries(sbi)) + sbi->kbytes_written = + le64_to_cpu(seg_i->journal->info.kbytes_written); + build_gc_manager(sbi); /* get an inode for node space */ @@ -1423,9 +1797,12 @@ try_onemore: if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - if (sbi->s_proc) + if (sbi->s_proc) { proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); @@ -1441,7 +1818,7 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. 
*/ if (bdev_read_only(sb->s_bdev) && - !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; goto free_kobj; } @@ -1449,14 +1826,27 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - err = recover_fsync_data(sbi); - if (err) { + if (!retry) + goto skip_recovery; + + err = recover_fsync_data(sbi, false); + if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + "Cannot recover all fsync data errno=%d", err); + goto free_kobj; + } + } else { + err = recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); goto free_kobj; } } +skip_recovery: /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1473,20 +1863,26 @@ try_onemore: kfree(options); /* recover broken superblock */ - if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { - f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); - f2fs_commit_super(sbi, true); + if (recovery) { + err = f2fs_commit_super(sbi, true); + f2fs_msg(sb, KERN_INFO, + "Try to recover %dth superblock, ret: %d", + sbi->valid_super_block ? 1 : 2, err); } - sbi->cp_expires = round_jiffies_up(jiffies); - + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: + f2fs_sync_inode_meta(sbi); kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); @@ -1494,7 +1890,9 @@ free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + truncate_inode_pages_final(NODE_MAPPING(sbi)); mutex_lock(&sbi->umount_mutex); + release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); @@ -1502,16 +1900,18 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -free_cp: kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); free_options: + destroy_percpu_info(sbi); kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); /* give only one another chance */ @@ -1547,8 +1947,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1590,25 +1991,23 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } - err = f2fs_init_crypto(); - if (err) - goto free_kset; - err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_crypto; + goto free_kset; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_crypto: - f2fs_exit_crypto(); free_kset: kset_unregister(f2fs_kset); free_extent_cache: @@ -1629,15 +2028,14 @@ static void 
__exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); - f2fs_exit_crypto(); + unregister_shrinker(&f2fs_shrinker_info); + kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); - kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 145fb659ad44..562ce0821559 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -29,7 +29,8 @@ static inline void __print_last_io(void) last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, last_io.fio.blk_addr, + last_io.fio.rw, + last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); } @@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) last_io.pid == pid && last_io.type == __file_type(inode, pid) && last_io.fio.rw == fio->rw && - last_io.fio.blk_addr + last_io.len == fio->blk_addr) { + last_io.fio.new_blkaddr + last_io.len == + fio->new_blkaddr) { last_io.len++; return; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 862368a32e53..69c6bb9cf207 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -264,18 +264,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static void *read_all_xattrs(struct inode *inode, struct page *ipage) +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; + int err; inline_size = inline_xattr_size(inode); txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) - return NULL; + return -ENOMEM; /* read from inline xattr */ if (inline_size) { @@ -286,8 +288,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_addr = inline_xattr_addr(ipage); } else { page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + err = PTR_ERR(page); goto fail; + } inline_addr = inline_xattr_addr(page); } memcpy(txattr_addr, inline_addr, inline_size); @@ -301,8 +305,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) /* The inode already has an extended attribute block. 
*/ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); - if (IS_ERR(xpage)) + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); goto fail; + } xattr_addr = page_address(xpage); memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); @@ -316,10 +322,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); header->h_refcount = cpu_to_le32(1); } - return txattr_addr; + *base_addr = txattr_addr; + return 0; fail: kzfree(txattr_addr); - return NULL; + return err; } static inline int write_all_xattrs(struct inode *inode, __u32 hsize, @@ -345,7 +352,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); + set_page_dirty(ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -353,7 +361,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(page); } inline_addr = inline_xattr_addr(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); } memcpy(inline_addr, txattr_addr, inline_size); f2fs_put_page(page, 1); @@ -374,7 +382,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(xpage); } f2fs_bug_on(sbi, new_nid); - f2fs_wait_on_page_writeback(xpage, NODE); + f2fs_wait_on_page_writeback(xpage, NODE, true); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -412,9 +420,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { @@ -448,9 +456,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; - base_addr = read_all_xattrs(inode, NULL); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, NULL, &base_addr); + if (error) + return error; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -481,13 +489,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; size_t len; __u32 new_hsize; - int error = -ENOMEM; + int error = 0; if (name == NULL) return -EINVAL; @@ -503,9 +510,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - goto exit; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; /* find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -538,7 +545,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, free = free + ENTRY_SIZE(here); if (unlikely(free < newsize)) { - error = -ENOSPC; + error = -E2BIG; goto exit; } } @@ -566,7 +573,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, * Before we come here, old entry is removed. * We just write new entry. 
*/ - memset(last, 0, newsize); last->e_name_index = index; last->e_name_len = len; memcpy(last->e_name, name, len); @@ -580,19 +586,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; inode->i_ctime = CURRENT_TIME; - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - - if (ipage) - update_inode(inode, ipage); - else - update_inode_page(inode); + f2fs_mark_inode_dirty_sync(inode); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: kzfree(base_addr); return error; @@ -609,7 +613,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -618,5 +622,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 71a7100d5492..d2fd0387a3c7 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #define f2fs_xattr_handlers NULL static inline int f2fs_setxattr(struct inode *inode, int index, - const char *name, const void *value, size_t size, int flags) + const char *name, const void *value, size_t size, + struct page *page, int flags) { return -EOPNOTSUPP; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 01448e01b40d..c066f6b56e58 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -229,6 +229,7 @@ struct dentry_operations { #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ +#define DCACHE_ENCRYPTED_WITH_KEY 0x04000000 /* dir is encrypted with a valid key */ #define DCACHE_OP_REAL 0x08000000 extern seqlock_t rename_lock; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3d6e6ce44c5c..e46e7d10312b 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -21,7 +21,7 @@ #define F2FS_BLKSIZE 4096 /* support only 4KB block */ #define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ -#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE) +#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ @@ -51,6 +51,7 @@ #define MAX_ACTIVE_DATA_LOGS 8 #define VERSION_LEN 256 +#define MAX_VOLUME_NAME 512 /* * For superblock @@ -84,7 +85,7 @@ struct f2fs_super_block { __le32 node_ino; /* node inode number */ __le32 meta_ino; /* meta inode number */ __u8 uuid[16]; /* 128-bit uuid for volume */ - __le16 volume_name[512]; /* volume name */ + __le16 volume_name[MAX_VOLUME_NAME]; /* volume name */ __le32 extension_count; /* # of extensions below */ __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ __le32 cp_payload; @@ -99,6 +100,7 @@ struct f2fs_super_block 
{ /* * For checkpoint */ +#define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 #define CP_ERROR_FLAG 0x00000008 @@ -169,12 +171,12 @@ struct f2fs_extent { #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ -#define ADDRS_PER_INODE(fi) addrs_per_inode(fi) +#define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ #define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */ -#define ADDRS_PER_PAGE(page, fi) \ - (IS_INODE(page) ? ADDRS_PER_INODE(fi) : ADDRS_PER_BLOCK) +#define ADDRS_PER_PAGE(page, inode) \ + (IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) @@ -261,7 +263,7 @@ struct f2fs_node { /* * For NAT entries */ -#define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -281,7 +283,7 @@ struct f2fs_nat_block { * Not allow to change this. */ #define SIT_VBLOCK_MAP_SIZE 64 -#define SIT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_sit_entry)) +#define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) /* * F2FS uses 4 bytes to represent block address. As a result, supported size of @@ -350,7 +352,7 @@ struct f2fs_summary { struct summary_footer { unsigned char entry_type; /* SUM_TYPE_XXX */ - __u32 check_sum; /* summary checksum */ + __le32 check_sum; /* summary checksum */ } __packed; #define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\ @@ -363,6 +365,12 @@ struct summary_footer { sizeof(struct sit_journal_entry)) #define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ sizeof(struct sit_journal_entry)) + +/* Reserved area should make size of f2fs_extra_info equals to + * that of nat_journal and sit_journal. 
+ */ +#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8) + /* * frequently updated NAT/SIT entries can be stored in the spare area in * summary blocks @@ -392,18 +400,28 @@ struct sit_journal { __u8 reserved[SIT_JOURNAL_RESERVED]; } __packed; -/* 4KB-sized summary block structure */ -struct f2fs_summary_block { - struct f2fs_summary entries[ENTRIES_IN_SUM]; +struct f2fs_extra_info { + __le64 kbytes_written; + __u8 reserved[EXTRA_INFO_RESERVED]; +} __packed; + +struct f2fs_journal { union { __le16 n_nats; __le16 n_sits; }; - /* spare area is used by NAT or SIT journals */ + /* spare area is used by NAT or SIT journals or extra info */ union { struct nat_journal nat_j; struct sit_journal sit_j; + struct f2fs_extra_info info; }; +} __packed; + +/* 4KB-sized summary block structure */ +struct f2fs_summary_block { + struct f2fs_summary entries[ENTRIES_IN_SUM]; + struct f2fs_journal journal; struct summary_footer footer; } __packed; @@ -497,4 +515,6 @@ enum { F2FS_FT_MAX }; +#define S_SHIFT 12 + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index bff4ce57b77e..5b79adb9782e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -52,6 +52,8 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; +struct fscrypt_info; +struct fscrypt_operations; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -677,6 +679,9 @@ struct inode { struct hlist_head i_fsnotify_marks; #endif +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + struct fscrypt_info *i_crypt_info; +#endif void *i_private; /* fs or device private pointer */ }; @@ -1337,6 +1342,8 @@ struct super_block { #endif const struct xattr_handler **s_xattr; + const struct fscrypt_operations *s_cop; + struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h new file mode 100644 index 000000000000..76cff18bb032 --- /dev/null +++ b/include/linux/fscrypto.h @@ -0,0 +1,435 @@ +/* + * General per-file encryption definition + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. 
+ */ + +#ifndef _LINUX_FSCRYPTO_H +#define _LINUX_FSCRYPTO_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_WRITE_PATH_FL 0x00000002 + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ + u8 mode; /* Encryption mode for tfm */ +}; + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int fscrypt_key_size(int mode) +{ + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + return FS_AES_256_XTS_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_GCM: + return FS_AES_256_GCM_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CBC: + return FS_AES_256_CBC_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CTS: + return FS_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define FS_FNAME_NUM_SCATTER_ENTRIES 4 +#define FS_CRYPTO_BLOCK_SIZE 16 +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. 
+ */ +static inline u32 fscrypt_symlink_data_len(u32 l) +{ + if (l < FS_CRYPTO_BLOCK_SIZE) + l = FS_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct fscrypt_symlink_data) - 1); +} + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * crypto opertions for filesystems + */ +struct fscrypt_operations { + int (*get_context)(struct inode *, void *, size_t); + int (*key_prefix)(struct inode *, u8 **); + int (*prepare_context)(struct inode *); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); +} + +static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) +{ + if (size == fscrypt_key_size(mode)) + return size; + return 0; +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' 
&& str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +extern const struct dentry_operations fscrypt_d_ops; +#endif + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + d_set_d_op(dentry, &fscrypt_d_ops); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +int fscrypt_initialize(void); + +extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); +extern int fscrypt_decrypt_page(struct page *); +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern void fscrypt_restore_control_page(struct page *); +extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, + unsigned int); +/* policy.c */ +extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); +extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ +extern int get_crypt_info(struct inode *); +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct fscrypt_name *); +extern void fscrypt_free_filename(struct fscrypt_name *); +extern u32 fscrypt_fname_encrypted_size(struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); +#endif + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, + gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) +{ + return; +} + +static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, + struct page *p, gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_notsupp_decrypt_page(struct page *p) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c, + struct bio *b) +{ + return; +} + +static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b) +{ + return; +} + +static inline void fscrypt_notsupp_restore_control_page(struct page *p) +{ + 
return; +} + +static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, + sector_t s, unsigned int f) +{ + return -EOPNOTSUPP; +} + +/* policy.c */ +static inline int fscrypt_notsupp_process_policy(struct file *f, + const struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_get_policy(struct inode *i, + struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_has_permitted_context(struct inode *p, + struct inode *i) +{ + return 0; +} + +static inline int fscrypt_notsupp_inherit_context(struct inode *p, + struct inode *i, void *v, bool b) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_notsupp_get_encryption_info(struct inode *i) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_put_encryption_info(struct inode *i, + struct fscrypt_info *f) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_notsupp_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c) +{ + return; +} + +static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} +#endif /* _LINUX_FSCRYPTO_H */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 00b4a6308249..3a09bb4dc3b2 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -52,6 +52,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -693,28 +694,32 @@ TRACE_EVENT(f2fs_direct_IO_exit, __entry->ret) ); -TRACE_EVENT(f2fs_reserve_new_block, +TRACE_EVENT(f2fs_reserve_new_blocks, - TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node), + TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node, + blkcnt_t count), - TP_ARGS(inode, nid, ofs_in_node), + TP_ARGS(inode, nid, ofs_in_node, count), TP_STRUCT__entry( __field(dev_t, dev) __field(nid_t, nid) __field(unsigned int, ofs_in_node) + __field(blkcnt_t, count) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = nid; __entry->ofs_in_node = ofs_in_node; + __entry->count = count; ), - TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u", + TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", show_dev(__entry), (unsigned int)__entry->nid, - __entry->ofs_in_node) + __entry->ofs_in_node, + (unsigned long long)__entry->count) ); DECLARE_EVENT_CLASS(f2fs__submit_page_bio, @@ -727,7 +732,8 @@ 
DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) - __field(block_t, blkaddr) + __field(block_t, old_blkaddr) + __field(block_t, new_blkaddr) __field(int, rw) __field(int, type) ), @@ -736,16 +742,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; - __entry->blkaddr = fio->blk_addr; + __entry->old_blkaddr = fio->old_blkaddr; + __entry->new_blkaddr = fio->new_blkaddr; __entry->rw = fio->rw; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "blkaddr = 0x%llx, rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", show_dev_ino(__entry), (unsigned long)__entry->index, - (unsigned long long)__entry->blkaddr, + (unsigned long long)__entry->old_blkaddr, + (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->rw), show_block_type(__entry->type)) ); @@ -1265,6 +1273,44 @@ TRACE_EVENT(f2fs_destroy_extent_tree, __entry->node_cnt) ); +DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(s64, count) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->type = type; + __entry->count = count; + ), + + TP_printk("dev = (%d,%d), %s, dirty count = %lld", + show_dev(__entry), + show_file_type(__entry->type), + __entry->count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index cd0c1a1a9ccf..15048097910f 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -172,6 +172,24 @@ struct inodes_stat_t { #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) +/* + * File system encryption support + */ +/* Policy provided via an ioctl on the topmost directory */ +#define FS_KEY_DESCRIPTOR_SIZE 8 + +struct fscrypt_policy { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; +} __packed; + +#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ From ae81ccb3bdcd879f889d36dcec76fccdd2f329c4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 13:38:41 -0700 Subject: [PATCH 1053/3220] f2fs: fix wrong sum_page pointer in f2fs_gc commit de0dcc40f6e24d6bac6b60e36eac4659bbbd3f00 upstream. This patch fixes using a wrong pointer for sum_page in f2fs_gc. 
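In outline, the fix looks up the summary page before evaluating any skip condition, so sum_page can never carry a stale value from a skipped iteration into page_address(). A simplified sketch of the resulting loop shape (condensed from the diff below; the real code uses a goto to a per-segment label):

	for (segno = start_segno; segno < end_segno; segno++) {
		/* fetch first: sum_page now always matches this segno */
		sum_page = find_get_page(META_MAPPING(sbi),
					GET_SUM_BLOCK(sbi, segno));
		f2fs_put_page(sum_page, 0);

		/* ...then decide whether to skip this segment */
		if (get_valid_blocks(sbi, segno, 1) == 0 ||
				!PageUptodate(sum_page) ||
				unlikely(f2fs_cp_error(sbi)))
			continue;

		sum = page_address(sum_page);
		/* ... process the victim's summary ... */
	}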
Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0a0a1ad1fe1f..4336807cc690 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -848,16 +848,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0 || - unlikely(f2fs_cp_error(sbi))) - goto next; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); From 91d38ba8414bdbe7161d2d02eddf5e671db0024d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Nov 2016 14:06:03 -0800 Subject: [PATCH 1054/3220] posix_acl: Clear SGID bit when setting file permissions commit 073931017b49d9458aa351605b43a7e34598caef upstream. Cherry-pick to f2fs only for generic/375 from: (073931017: posix_acl: Clear SGID bit when setting file permissions) Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fb0744b94c2f..4a34040932e9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -215,12 +215,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(inode, inode->i_mode); - if (error == 0) - acl = NULL; } break; From ab6f3626a82db73b2a490804fe6591750c5b2233 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Nov 2016 10:51:17 -0800 Subject: [PATCH 1055/3220] f2fs: fix overflow due to condition check order commit e87f7329bbd6760c2acc4f1eb423362b08851a71 upstream. In the last ilen case, i was already increased, resulting in accessing an out-of-bounds entry of do_replace and blkaddr. Fix by checking ilen first to exit the loop. Fixes: 2aa8fbb9693020 ("f2fs: refactor __exchange_data_block for speed up") Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c6e33258fabf..5c4ea4cf2fb1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -971,7 +971,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, new_size = (dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); - } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); f2fs_put_dnode(&dn); } else { From 6b266c3a9951abd948effac21d19ced0dee07dda Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Nov 2016 12:45:15 -0800 Subject: [PATCH 1056/3220] f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack commit 8508e44ae98622f841f5ef29d0bf3d5db4e0c1cc upstream. We don't guarantee that cp_addr is fixed by cp_version. This is to sync with f2fs-tools.
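To illustrate the intent (an editorial sketch, assuming the standard two-pack checkpoint layout where pack 1 starts at cp_blkaddr and pack 2 one segment later): the valid pack is now recorded in sbi->cur_cp_pack at mount time and flipped after each successful checkpoint, instead of being derived from checkpoint-version parity, which f2fs-tools does not preserve.

	/*
	 * Illustrative timeline, not literal kernel code:
	 *
	 *   mount:         cp pack 1 is the valid one -> cur_cp_pack = 1
	 *   checkpoint #1: __start_cp_next_addr() targets pack 2
	 *                  (cp_blkaddr + blocks_per_seg); on success,
	 *                  __set_cp_next_pack() flips cur_cp_pack to 2
	 *   checkpoint #2: writes pack 1 again, flips back to 1
	 *
	 * The writer always targets the pack not currently in use, so a
	 * crash mid-checkpoint leaves the previous pack intact.
	 */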
Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +++++++- fs/f2fs/f2fs.h | 28 +++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cb23d6cf676b..1608ae8eea97 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -770,6 +770,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) if (sanity_check_ckpt(sbi)) goto fail_no_cp; + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + if (cp_blks <= 1) goto done; @@ -1121,7 +1126,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1185,6 +1190,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index af293e84e5cd..45d1e4522760 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -789,6 +789,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ @@ -1354,22 +1355,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; - return start_addr; } +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); From 311aa690ef7bd32732a65d957c2490143c7b9830 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:34 +0800 Subject: [PATCH 1057/3220] f2fs: exclude free nids building and allocation commit 2411cf5befa5804e4ced4c45a3212d7653869286 upstream. During nid allocation, we need to exclude the building and allocating flows of free nids from each other. This is because building the free nid cache takes two steps: a) load free nids from unused nat entries in NAT pages, b) update the free nid cache by checking the nat journal. The two steps should be atomic, otherwise a used nid can be allocated as a free one after a) and before b). This patch adds the missing lock which covers build_free_nids in unlock_operation and f2fs_balance_fs_bg to avoid that.
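The window being closed can be sketched as follows (an editorial illustration of the a)/b) race, not code from the patch):

	/*
	 * builder (build_free_nids)        allocator (alloc_nid)
	 * -------------------------        ---------------------
	 * a) NAT page says nid N is
	 *    unused -> add N to the
	 *    free nid cache
	 *                                  grab N from the cache and
	 *                                  hand it out as free
	 * b) nat journal says N is in
	 *    use -> drop N, but N has
	 *    already been given out
	 *
	 * Holding nm_i->build_lock across both a) and b), and taking the
	 * same lock from the allocation path, makes the two steps atomic
	 * with respect to allocation.
	 */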
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b1e615ed2bef..a8c2bd3e5029 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1786,7 +1786,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1840,6 +1840,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1876,9 +1883,7 @@ retry: spin_unlock(&nm_i->free_nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); goto retry; } From ab38818bdd9a67f9ce6aade05f61e373d45ae132 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:00 +0800 Subject: [PATCH 1058/3220] f2fs: fix to release discard entries during checkpoint commit 2dd15654ac0abe587a245a09a7823bbbd588bfb7 upstream. In f2fs_fill_super, if any IO error occurs during recovery, cached discard entries will be leaked. In order to avoid this, make write_checkpoint() handle the memory release by itself; besides, move clear_prefree_segments into write_checkpoint for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- fs/f2fs/super.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1608ae8eea97..63ca342a3cc8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1187,7 +1187,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); @@ -1264,6 +1263,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); + if (err) + release_discard_addrs(sbi); + else + clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fd249cc9b96e..006138a6c5ab 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -738,7 +738,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); From 75bb19d8b7479bfa62aef3a5e13bf737144ef76b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:01 +0800 Subject: [PATCH 1059/3220] f2fs: give a chance to detach from dirty list commit 933439c8f3474e329709b715b43b0b8168bbecf8 upstream. If there are no dirty pages in an inode, we should give it a chance to detach from the global dirty list; otherwise writeback needs to make another unnecessary .writepages call just for detaching.
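The patch installs one recurring pattern at every site that cleans a dirty data page. A sketch of the pattern (remove_dirty_inode drops the inode from the global dirty list once its dirty-page count reaches zero):

	if (clear_page_dirty_for_io(page)) {
		inode_dec_dirty_pages(inode);
		/* last dirty page gone: detach now rather than let
		 * writeback issue a useless ->writepages call later */
		remove_dirty_inode(inode);
	}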
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++--- fs/f2fs/dir.c | 1 + fs/f2fs/gc.c | 4 +++- fs/f2fs/inline.c | 4 +++- fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 4 +++- 6 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fc9bfd8f566d..4000ccb85a78 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1817,12 +1817,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e634a637c443..c0dba11519cf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4336807cc690..72a0ca08f901 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -670,8 +670,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b150d03beec1..a1acbc67d827 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -154,8 +154,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8c2bd3e5029..97eb2c0811b5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1203,6 +1203,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3c61ae37f92..75477ec6c535 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,8 +272,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); From 4c7eae1fef0c32f292507632ed7cbc9b0416b00e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:02 +0800 Subject: [PATCH 1060/3220] f2fs: add missing f2fs_balance_fs in f2fs_zero_range commit 9434fcde1fa0f48e1a29fbdd9d436fa279aeb909 upstream. f2fs_balance_fs should be called between node page updates; otherwise the node page count will grow far beyond the watermark that triggers foreground garbage collection, resulting in a high risk of hitting LFS allocation failure.
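In shape, the fix follows f2fs's usual convention: do the node-page-modifying work under f2fs_lock_op(), then re-balance with the dn.node_changed hint once the op lock is dropped. Simplified from the hunk below:

	f2fs_lock_op(sbi);
	ret = f2fs_do_zero_range(&dn, index, end);	/* may dirty node pages */
	f2fs_put_dnode(&dn);
	f2fs_unlock_op(sbi);

	/* give GC a chance before the next batch, so dirty node pages
	 * cannot pile up far past the trigger watermark */
	f2fs_balance_fs(sbi, dn.node_changed);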
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c4ea4cf2fb1..c0774c98dce4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1222,6 +1222,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; From 332f40b43f2f4a86eec4a2aaf1d3e3295ed673f2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:03 +0800 Subject: [PATCH 1061/3220] f2fs: don't miss any f2fs_balance_fs cases commit 6f2d8ed654bfa391854df4de854953f772a16a9d upstream. In f2fs_map_blocks, let f2fs_balance_fs detect node page modification via dn.node_changed, to avoid missing some corner cases. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4000ccb85a78..c390542917e4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -671,7 +671,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -730,10 +729,8 @@ next_block: } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -788,7 +785,6 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -807,9 +803,8 @@ skip: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -817,7 +812,7 @@ sync_out: unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); From 9e3d0bf6d3714bc3481e32828bd56eecc0700d29 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:04 +0800 Subject: [PATCH 1062/3220] f2fs: be aware of extent beyond EOF in fiemap commit 58736fa60f6ae659ac72da8b1580c308b47e8edd upstream. f2fs can support fallocating blocks beyond the file size without changing the size, but f2fs's ->fiemap was restricted and could not detect extents fallocated past EOF; now relieve the restriction.
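The key change, sketched: the hole-scanning branch of f2fs_fiemap used to stop at i_size, which made keep-size fallocated extents past EOF invisible; bounding the walk by the filesystem-wide maximum file size lets the scan reach them. Condensed from the hunk below:

	/* HOLE: keep scanning past EOF, up to the fs-wide limit, so
	 * extents preallocated with FALLOC_FL_KEEP_SIZE are reported */
	if (!buffer_mapped(&map_bh)) {
		start_blk = next_pgofs;

		if (blk_to_logical(inode, start_blk) <
		    blk_to_logical(inode, F2FS_I_SB(inode)->max_file_blocks))
			goto prep_next;

		flags |= FIEMAP_EXTENT_LAST;
	}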
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c390542917e4..0cdabfce6c17 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -881,7 +881,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -898,13 +897,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -923,13 +915,11 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. - */ + flags |= FIEMAP_EXTENT_LAST; } From 1158df42b2c66b4c9f150615ae37a1cc7edb3c74 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:05 +0800 Subject: [PATCH 1063/3220] f2fs: fix to update largest extent under lock commit b691d98fdd4cc2514c60fd6975e6016da203e64f upstream. In order to avoid a race, make sure the largest extent in the cache is updated under the lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..1fbebcb33a9d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -252,6 +252,7 @@ retry: int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -267,11 +268,13 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); From c675400f4a1149234ba1efeeb8a7d6df8a4bb880 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:06 +0800 Subject: [PATCH 1064/3220] f2fs: fix error handling in fsync_node_pages commit 9de69279750e9740bc7221c7051a40c0516a58fb upstream. In fsync_node_pages, if f2fs is tagged with CP_ERROR_FLAG, make sure the bio cache is flushed before returning.
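Structurally, the fix replaces an early return with a jump to a common exit that still submits whatever node pages were already merged into the bio cache. In outline (condensed from the hunk below):

	if (unlikely(f2fs_cp_error(sbi))) {
		f2fs_put_page(last_page, 0);
		pagevec_release(&pvec);
		ret = -EIO;
		goto out;	/* still flush merged bios below */
	}
	/* ... */
out:
	if (nwritten)
		f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE);
	return ret ? -EIO : 0;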
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 97eb2c0811b5..bc38e5a92b4b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1338,7 +1338,8 @@ retry: if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1411,7 +1412,7 @@ continue_unlock: unlock_page(last_page); goto retry; } - +out: if (nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; From fbeee49e0605a0c723bd0d870d59175a764377c6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 11 Oct 2016 10:36:12 -0700 Subject: [PATCH 1065/3220] f2fs: fix sparse warnings commit 0c0b471e43e7acf0747c6eb410863bf78c14750d upstream. f2fs contained a number of endianness conversion bugs. Also, one function should have been 'static'. Found with sparse by running 'make C=2 CF=-D__CHECK_ENDIAN__ fs/f2fs/' Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 5 +++-- fs/f2fs/node.h | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c0dba11519cf..7136dc1ade11 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a1acbc67d827..56363c18489a 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -438,7 +438,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc38e5a92b4b..d2ba37a84f8e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..cfdcf98516a1 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -313,7 +313,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == cpver_of_node(page); } /* From 8db338877d72a24743a3808d4bc774f74a058f8f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:56:59 +0800 Subject: [PATCH 1066/3220] f2fs: clear nlink if fail to add_link commit a11b9f65eae766b17ec3451a6a1766f0a9d1dbff upstream. We don't need to keep an incompletely created inode in the cache, so if we fail to add a link into the directory during new inode creation, it's better to set the inode's nlink to zero; then we can evict the inode immediately.
Otherwise, release of the nid belonging to the inode will be delayed until the inode cache is shrunk, which may cause a seemingly endless loop while allocating free nids when running the generic/269 case of the fstests suite. Signed-off-by: Chao Yu [Jaegeuk Kim: add update_inode_page to fix kernel panic] Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1fbebcb33a9d..d32fd0343eae 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -387,6 +387,8 @@ retry: f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -427,6 +429,18 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); From e18c2624506f44b87f505adc36b78782eacd8628 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Oct 2016 19:28:29 +0800 Subject: [PATCH 1067/3220] f2fs: split free nid list commit b8559dc242d1d47dcf99660a4d6afded727e0cc0 upstream. During free nid allocation, in order to do preallocation, we tag a free nid entry as allocated and still leave it in the free nid list, so other allocators that want to grab free nids have to traverse the free nid list for lookup. This becomes an overhead when free nids are allocated intensively by multiple threads. This patch splits the free nid list into two lists, {free,alloc}_nid_list, to keep free nids and preallocated free nids separately; after that, the traversal latency is gone. It also splits nid_cnt for separate statistics. Additionally, introduce __insert_nid_to_list and __remove_nid_from_list for cleanup.
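A rough userspace model of the resulting two-list scheme (a pthread mutex and a hand-rolled singly linked list stand in for the kernel's spinlock and list_head; illustrative only, not the kernel code). The point is that allocation now pops the head of the free list in O(1) instead of scanning past preallocated NID_ALLOC entries:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

enum nid_list { FREE_NID_LIST, ALLOC_NID_LIST, MAX_NID_LIST };

struct free_nid { int nid; struct free_nid *next; };

static struct free_nid *nid_list[MAX_NID_LIST];
static unsigned int nid_cnt[MAX_NID_LIST];
static pthread_mutex_t nid_list_lock = PTHREAD_MUTEX_INITIALIZER;

static void push(enum nid_list l, struct free_nid *i)
{
	i->next = nid_list[l];
	nid_list[l] = i;
	nid_cnt[l]++;
}

/* grab one nid: pop the free-list head, park it on the alloc list */
static int alloc_nid(int *nid)
{
	struct free_nid *i;

	pthread_mutex_lock(&nid_list_lock);
	i = nid_list[FREE_NID_LIST];
	if (!i) {
		pthread_mutex_unlock(&nid_list_lock);
		return 0;	/* caller would go build more free nids */
	}
	nid_list[FREE_NID_LIST] = i->next;
	nid_cnt[FREE_NID_LIST]--;
	push(ALLOC_NID_LIST, i);	/* preallocated: out of the scan path */
	*nid = i->nid;
	pthread_mutex_unlock(&nid_list_lock);
	return 1;
}

int main(void)
{
	int n, nid;

	for (n = 0; n < 4; n++) {	/* single-threaded setup, no lock needed */
		struct free_nid *i = malloc(sizeof(*i));

		i->nid = n + 3;		/* low nids are reserved in f2fs */
		push(FREE_NID_LIST, i);
	}
	while (alloc_nid(&nid))
		printf("allocated nid %d (free %u, alloc %u)\n",
		       nid, nid_cnt[FREE_NID_LIST], nid_cnt[ALLOC_NID_LIST]);
	return 0;
}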
Signed-off-by: Chao Yu [Jaegeuk Kim: modify f2fs_bug_on to avoid needless branches] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 11 ++-- fs/f2fs/f2fs.h | 14 +++-- fs/f2fs/node.c | 136 +++++++++++++++++++++++++++------------------ fs/f2fs/node.h | 11 ++-- fs/f2fs/shrinker.c | 4 +- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 1c35e80732e0..5cab80dfa4dc 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -74,7 +74,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +195,9 @@ get_cache: si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -324,8 +327,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 45d1e4522760..cec025852c22 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -529,6 +529,12 @@ static inline void __try_update_largest_extent(struct inode *inode, } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -548,9 +554,9 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ /* for checkpoint */ @@ -2214,7 +2220,7 @@ struct f2fs_stat_info { s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; s64 inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d2ba37a84f8e..5bb2fa324e68 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - 
mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -1699,10 +1699,31 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, struct free_nid *i) { - list_del(&i->list); radix_tree_delete(&nm_i->free_nid_root, i->nid); } +static void __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; + list_del(&i->list); +} + static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1733,33 +1754,33 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, FREE_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1798,7 +1819,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ @@ -1834,7 +1855,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1867,23 +1888,22 @@ retry: if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if 
(nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ build_free_nids(sbi); @@ -1898,11 +1918,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1919,17 +1940,20 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); + if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; } else { i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST); } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1941,24 +1965,26 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); + kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2014,7 +2040,7 @@ recover_xnid: if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2044,7 +2070,7 @@ retry: } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2278,20 +2304,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; 
nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2336,17 +2364,19 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index cfdcf98516a1..e7997e240366 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..ec539f407cc4 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -26,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; + if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) + return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; return 0; } From 6a248819a280ff0c72ef4d9943ead88d7d6de0cb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 10:09:59 -0700 Subject: [PATCH 1068/3220] f2fs: clean up free nid list operations commit eb0aa4b80784b8551bd5be577024e067bc83ef94 upstream. This patch cleans up to use consistent free nid list ops. 
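The cleanup folds the radix-tree maintenance into the list helpers themselves, keyed by new/reuse flags, so callers can no longer update one structure and forget the other. A simplified model of that idea (not the kernel API: a boolean array stands in for the free_nid_root radix tree and the list bookkeeping is elided):

#include <stdbool.h>
#include <stdio.h>

#define NR_NIDS 16

static bool in_tree[NR_NIDS];	/* stands in for the free_nid_root radix tree */

/* add to a list; insert into the lookup structure only for brand-new nids */
static int insert_nid(int nid, bool new)
{
	if (new) {
		if (in_tree[nid])
			return -17;	/* -EEXIST, as radix_tree_insert() reports */
		in_tree[nid] = true;
	}
	/* ...list_add_tail() and nid_cnt[list]++ would go here... */
	return 0;
}

/* remove from a list; keep the lookup entry if the nid moves between lists */
static void remove_nid(int nid, bool reuse)
{
	/* ...list_del() and nid_cnt[list]-- would go here... */
	if (!reuse)
		in_tree[nid] = false;	/* as radix_tree_delete() would */
}

int main(void)
{
	insert_nid(5, true);	/* add_free_nid(): brand-new entry */
	remove_nid(5, true);	/* alloc_nid(): moving free -> alloc */
	insert_nid(5, false);	/* still findable while allocated */
	printf("nid 5 lookup: %d\n", in_tree[5]);	/* prints 1 */
	remove_nid(5, false);	/* alloc_nid_done(): gone for good */
	printf("nid 5 lookup: %d\n", in_tree[5]);	/* prints 0 */
	return 0;
}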
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 56 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5bb2fa324e68..ef5357c7af24 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1696,25 +1696,26 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) -{ - radix_tree_delete(&nm_i->free_nid_root, i->nid); -} - -static void __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : i->state != NID_ALLOC); nm_i->nid_cnt[list]++; list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; } static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) + struct free_nid *i, enum nid_list list, bool reuse) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1722,6 +1723,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, i->state != NID_ALLOC); nm_i->nid_cnt[list]--; list_del(&i->list); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1729,6 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; + int err; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1755,15 +1759,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) } spin_lock(&nm_i->nid_list_lock); - if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - __insert_nid_to_list(sbi, i, FREE_NID_LIST); - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); return 1; } @@ -1776,8 +1778,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1897,9 +1898,9 @@ retry: struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1921,8 +1922,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ 
-1944,14 +1944,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST); + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } spin_unlock(&nm_i->nid_list_lock); @@ -1978,9 +1977,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); - + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2367,8 +2364,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); From e07f457ef777a1af0642b81af98b1e824748b265 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:35 +0800 Subject: [PATCH 1069/3220] f2fs: don't interrupt free nids building during nid allocation commit 3a2ad5672bb36ee9c07bab97dadc8b0f70d391f4 upstream. Let build_free_nids support sync/async methods. In the allocation flow of nids, we use the synchronous method, so that we can avoid looping in alloc_nid when free memory is low; in unblock_operations and f2fs_balance_fs_bg we use the asynchronous method, where a low-memory condition may interrupt us.
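A toy model of the sync/async split (standalone C with made-up thresholds; not the kernel code): the background path may bail out under memory pressure, while the allocation path always scans, so the alloc loop is guaranteed to make progress:

#include <stdbool.h>
#include <stdio.h>

static int free_nids;
static bool memory_tight = true;	/* assume pressure for the demo */

static void build_free_nids(bool sync)
{
	if (free_nids >= 8)		/* the "enough entries" check */
		return;
	if (!sync && memory_tight)	/* async callers may be interrupted */
		return;
	free_nids += 8;			/* scan NAT pages, add entries */
}

static bool alloc_nid(int *nid)
{
	while (!free_nids)
		build_free_nids(true);	/* sync: guaranteed progress, no spin */
	*nid = --free_nids;
	return true;
}

int main(void)
{
	int nid;

	build_free_nids(false);		/* background call: gives up here */
	alloc_nid(&nid);		/* allocation path: still succeeds */
	printf("got nid %d\n", nid);
	return 0;
}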
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 22 ++++++++++------------ fs/f2fs/segment.c | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 63ca342a3cc8..1d273f51bc1c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -990,7 +990,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cec025852c22..9e8d3c9af54a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2071,7 +2071,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ef5357c7af24..5800a1082fe8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1734,9 +1734,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct nat_entry *ne; int err; - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; - /* 0 nid should not be used */ if (unlikely(nid == 0)) return 0; @@ -1804,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void __build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1823,6 +1818,9 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; + if (!sync && !available_free_memory(sbi, FREE_NIDS)) + return; + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1865,10 +1863,10 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi); + __build_free_nids(sbi, sync); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1907,7 +1905,7 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi); + build_free_nids(sbi, true); goto retry; } @@ -2344,7 +2342,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 75477ec6c535..48903702de27 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -380,7 +380,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); /* checkpoint is the only way to shrink partial cached entries */ if 
(!available_free_memory(sbi, NAT_ENTRIES) || From 9f99694bb7e0272568e804e96fb4ae5772acc198 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:36 +0800 Subject: [PATCH 1070/3220] f2fs: avoid casted negative value as shrink count commit 02110a4fd53164db7cce3bb2780dce4d6c4e058f upstream. This patch makes sure the shrink count is non-negative, instead of a negative value that would be cast to a huge unsigned number. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/shrinker.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index ec539f407cc4..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) - return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) From beb74f7757f803290f80b7d1dd5b85e64c28d223 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:28:05 -0700 Subject: [PATCH 1071/3220] f2fs: count dirty inodes to flush node pages during checkpoint commit b9610bdfcbdbb6017802ec6d1e073f445c98157d upstream. If there are a lot of dirty inodes, we need to flush all of them when doing a checkpoint. So, we need to count them when checking for enough free space. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..762743988426 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) From 92c9dec3429503a3cd84ef5fa121c8782bf58af0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:30:31 -0700 Subject: [PATCH 1072/3220] f2fs: call f2fs_balance_fs for setattr commit 15d04354555fdfe8005e1365009e349148fb5f90 upstream. If an inode becomes dirty, we need to check the number of dirty inodes to decide whether a further checkpoint is required.
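A back-of-the-envelope model of the budgeting the two patches above tighten (standalone C with made-up numbers; the real check also involves segment geometry and flags): dirty inodes become dirty node pages at checkpoint time, so counting them as imeta sections can tip the filesystem into "needs cleaning" earlier:

#include <stdbool.h>
#include <stdio.h>

static bool has_not_enough_free_secs(int free, int node_secs, int dent_secs,
				     int imeta_secs, int reserved, int needed)
{
	return free <= node_secs + 2 * dent_secs + imeta_secs +
		       reserved + needed;
}

int main(void)
{
	/* ignoring imeta, 10 free sections look sufficient... */
	printf("ignore imeta: %d\n",
	       has_not_enough_free_secs(10, 3, 2, 0, 2, 0));	/* prints 0 */
	/* ...but 4 sections' worth of dirty inodes flip the verdict */
	printf("count imeta:  %d\n",
	       has_not_enough_free_secs(10, 3, 2, 4, 2, 0));	/* prints 1 */
	return 0;
}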
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c0774c98dce4..53ba384cb675 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -695,7 +695,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is @@ -724,6 +723,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } f2fs_mark_inode_dirty_sync(inode); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } From c01ce254c71fd8e4a3539a5f46da77dc43859fe7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2016 15:36:31 -0700 Subject: [PATCH 1073/3220] f2fs: declare static function for __build_free_nids commit 3e7b5bbbef7f5eb8a19aa61b611c704bf8230937 upstream. This patch avoids a build warning. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5800a1082fe8..e1ce0b8438fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1806,7 +1806,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); From 04030d21a7b118fc9ba865223fb30017a0331cb6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Oct 2016 11:07:45 -0700 Subject: [PATCH 1074/3220] f2fs: use BIO_MAX_PAGES for bio allocation commit 664ba972df9b96942191db3068274cc1db899774 upstream. We don't need to allocate the bio partially in order to maximize sequential writes.
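A simplified model of the resulting writeback-quota logic (standalone C; a plain long stands in for wbc->nr_to_write, and 256 reflects the mainline BIO_MAX_PAGES value at the time): every batch is now sized by the fixed bio bound rather than derived from the device's max_sectors, with node writeback allowed a double batch:

#include <stdio.h>

#define BIO_MAX_PAGES 256	/* fixed block-layer bound */

enum page_type { DATA, NODE, META };

/* bump a writeback request up to a bio-sized batch; nodes get twice that */
static long nr_pages_to_write(enum page_type type, long *nr_to_write)
{
	long old = *nr_to_write;
	long desired = BIO_MAX_PAGES;

	if (type == NODE)
		desired <<= 1;
	*nr_to_write = desired;
	return desired - old;	/* pages added on top of the caller's quota */
}

int main(void)
{
	long quota = 64;

	printf("node batch pads by %ld\n", nr_pages_to_write(NODE, &quota));
	printf("new quota: %ld\n", quota);	/* 512 */
	return 0;
}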
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 +--- fs/f2fs/node.c | 3 +-- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 17 +++-------------- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1d273f51bc1c..1dffe86651be 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -226,7 +226,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0cdabfce6c17..e92cf046d9ff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -273,10 +273,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e1ce0b8438fc..389be7f6e07c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2099,7 +2099,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2108,7 +2107,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 48903702de27..ec4d74c26067 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2239,10 +2239,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 762743988426..89ab4301ef02 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -102,8 +102,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -696,13 +694,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. 
@@ -720,7 +711,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -737,11 +728,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; From dafac77e8d20988a0a5f4ef81fd73fb79a2426e5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 10 Nov 2016 18:04:05 -0800 Subject: [PATCH 1075/3220] f2fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps commit 02027d42c3f747945f19111d3da2092ed2148ac8 upstream. This is for backport only. fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 22 ++++++++++++++++++++++ fs/f2fs/file.c | 8 ++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/namei.c | 8 ++++---- fs/f2fs/xattr.c | 2 +- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7136dc1ade11..3b8ebec0450b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -312,7 +312,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -465,7 +465,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -683,7 +683,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -730,7 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8d3c9af54a..1938fe457041 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,6 +138,28 @@ static inline void inode_nohighmem(struct inode *inode) mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } +/** + * current_time - Return FS time + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs. + * + * Note that inode and inode->sb cannot be NULL. + * Otherwise, the function warns and returns time without truncation. 
+ */ +static inline struct timespec current_time(struct inode *inode) +{ + struct timespec now = current_kernel_time(); + + if (unlikely(!inode->i_sb)) { + WARN(1, "current_time() called with uninitialized super_block in the inode"); + return now; + } + + return timespec_trunc(now, inode->i_sb->s_time_gran); +} + /* * For checkpoint manager */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 53ba384cb675..04a3205d2934 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -632,7 +632,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -708,7 +708,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; } - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); } } @@ -1402,7 +1402,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1494,7 +1494,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) fi->i_flags = flags; inode_unlock(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 56363c18489a..3b2f88788e02 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -591,7 +591,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0f071a70522d..ae29726afff0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); @@ -182,7 +182,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -720,7 +720,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -774,7 +774,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 69c6bb9cf207..3a42405b6515 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -588,7 +588,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = 
F2FS_I(inode)->i_acl_mode; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && From 0ef31c7bfa50fba5a0fb63dff3f0bef28d961427 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 11:51:23 -0700 Subject: [PATCH 1076/3220] f2fs: keep dirty inodes selectively for checkpoint commit 7c45729a4d6d1c90879e6c5c2df325c2f6db7191 upstream. This is to avoid no free segment bug during checkpoint caused by a number of dirty inodes. The case was reported by Chao like this. 1. mount with lazytime option 2. fill 4k file until disk is full 3. sync filesystem 4. read all files in the image 5. umount In this case, we actually don't need to flush dirty inode to inode page during checkpoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/dir.c | 6 +++--- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/f2fs.h | 26 +++++++++++++------------- fs/f2fs/file.c | 9 +++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 7 ++++--- fs/f2fs/namei.c | 6 +++--- fs/f2fs/super.c | 29 ++++++++++++++++------------- fs/f2fs/xattr.c | 4 ++-- 10 files changed, 49 insertions(+), 44 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 4a34040932e9..a45d1f4b7b0f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -387,7 +387,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3b8ebec0450b..5594667c2f41 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1938fe457041..0d2502fdf892 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -541,13 +541,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > 
et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1680,7 +1680,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1707,7 +1707,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1716,7 +1716,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ -1727,7 +1727,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1741,7 +1741,7 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1756,19 +1756,19 @@ static inline bool f2fs_skip_inode_update(struct inode *inode) static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1896,13 +1896,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline int f2fs_readonly(struct super_block *sb) @@ -2054,7 +2054,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode *, bool); void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 04a3205d2934..ce38a350fb38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -265,7 +265,7 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -633,7 +633,7 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - 
f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -722,7 +722,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - f2fs_mark_inode_dirty_sync(inode); + /* update attributes only */ + f2fs_mark_inode_dirty_sync(inode, false); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1403,7 +1404,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3b2f88788e02..b859d5e377ae 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -592,7 +592,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d32fd0343eae..bfa512dde4ab 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -19,10 +19,11 @@ #include -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if (f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ae29726afff0..7f2fdb154180 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -775,7 +775,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -935,7 +935,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -950,7 +950,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 006138a6c5ab..23190a94840b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -620,24 +620,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + 
stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +650,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +679,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3a42405b6515..1c4d5e39586c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -594,7 +594,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: From a15f017e8a270f7c8d0f4d4599229a62454c0c7a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Oct 2016 18:27:56 -0700 Subject: [PATCH 1077/3220] f2fs: make clean inodes when flushing inode page commit 18340edc8da20b0d399eb25ba4bb631b27652f46 upstream. This patch tries to make more clean inodes when flushing dirty inodes in checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/inode.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1dffe86651be..ed79757c36e0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -924,7 +924,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bfa512dde4ab..7b5e402f0a72 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -339,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - if (update_inode_page(inode)) + if (update_inode_page(inode) && wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } From 3f137dda709e78201a92067f7f23b1747302dd9b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Oct 2016 19:09:57 -0700 Subject: [PATCH 1078/3220] f2fs: remove percpu_count due to performance regression commit 35782b233f37e48ecc469d9c7232f3f6a7fad41a upstream. This patch removes percpu_count usage due to performance regression in iozone. Fixes: 523be8a6b3 ("f2fs: use percpu_counter for page counters") Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 12 ++++++------ fs/f2fs/f2fs.h | 12 ++++++------ fs/f2fs/super.c | 16 +++++----------- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 5cab80dfa4dc..678733c78f55 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -313,17 +313,17 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d2502fdf892..932c53f441db 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -872,7 +872,7 @@ struct f2fs_sb_info { atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -1286,7 +1286,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) return; @@ -1303,7 +1303,7 @@ static inline void inode_inc_dirty_pages(struct inode *inode) static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1319,7 +1319,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } static inline s64 get_dirty_pages(struct inode *inode) @@ -2239,8 +2239,8 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, 
ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 23190a94840b..6034d51fc5fc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -696,10 +696,6 @@ static void f2fs_destroy_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } @@ -1450,6 +1446,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1474,6 +1471,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1489,13 +1489,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) From b3441f8c71f2e30b96a76ca007d51c2c2d6be354 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Nov 2016 14:52:15 +0100 Subject: [PATCH 1079/3220] f2fs: hide a maybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 230436b3ef3fd7d4a1da19edf5e87bb2d74e0fc2 upstream. gcc is unsure about the use of last_ofs_in_node, which might happen without a prior initialization: fs/f2fs//git/arm-soc/fs/f2fs/data.c: In function ‘f2fs_map_blocks’: fs/f2fs/data.c:799:54: warning: ‘last_ofs_in_node’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { As pointed out by Chao Yu, the code is actually correct: 'prealloc' is only set if last_ofs_in_node has been set, and the two always get updated together. This initializes last_ofs_in_node to dn.ofs_in_node for each new dnode at the start of the 'next_block' loop, which at that point is a correct initialization as well. I assume that compilers that correctly track the contents of the variables and do not warn about the condition also figure out that they can eliminate the extra assignment here.
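A minimal sketch of the pattern gcc trips over (hypothetical names, not the f2fs code): the guard and the value are only ever written together, so the later use is safe, but the compiler cannot prove it:

	/* hypothetical illustration of a -Wmaybe-uninitialized false positive */
	int demo(int n)
	{
		int guard = 0;
		int last;			/* no unconditional initialization */
		int i;

		for (i = 0; i < n; i++) {
			if (i & 1) {
				guard = 1;	/* always set together... */
				last = i;	/* ...with the value */
			}
		}
		if (guard)			/* safe, but gcc may still warn here */
			return last;
		return 0;
	}

Initializing the value next to the guard's setup, as the patch below does for last_ofs_in_node alongside ofs_in_node, silences the warning without changing behavior.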
Fixes: 46008c6d4232 ("f2fs: support in batch multi blocks preallocation") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e92cf046d9ff..f03c34d3797f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -708,7 +708,7 @@ next_dnode: } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: From d4ec990d259694a05fc3cb38ef0b70f6159c3eb8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2016 10:44:44 -0800 Subject: [PATCH 1080/3220] fs/crypto: catch up 4.9-rc6 commit d117b9acaeada0b243f31e0fe83e111fcc9a6644 upstream. Merge tag 'ext4_for_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 Pull ext4 fixes from Ted Ts'o: "A security fix (so a maliciously corrupted file system image won't panic the kernel) and some fixes for CONFIG_VMAP_STACK" Signed-off-by: Jaegeuk Kim --- fs/crypto/crypto.c | 26 ++++---- fs/crypto/fname.c | 132 ++++++++++++++++++--------------------- fs/crypto/keyinfo.c | 85 ++++++++++++++++--------- fs/crypto/policy.c | 4 ++ fs/f2fs/dir.c | 6 +- fs/f2fs/namei.c | 6 +- include/linux/fscrypto.h | 24 ------- 7 files changed, 141 insertions(+), 142 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2fc8c43ce531..2d40ab9edc9f 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -28,7 +28,6 @@ #include #include #include -#include static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -128,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) EXPORT_SYMBOL(fscrypt_get_ctx); /** - * fscrypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation + * page_crypt_complete() - completion callback for page crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void fscrypt_complete(struct crypto_async_request *req, int res) +static void page_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -152,7 +151,10 @@ static int do_page_crypto(struct inode *inode, struct page *src_page, struct page *dest_page, gfp_t gfp_flags) { - u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct { + __le64 index; + u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; + } xts_tweak; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -170,19 +172,17 @@ static int do_page_crypto(struct inode *inode, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fscrypt_complete, &ecr); + page_crypt_complete, &ecr); - BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - FS_XTS_TWEAK_SIZE - sizeof(index)); + BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); + xts_tweak.index = cpu_to_le64(index); + memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); 
else diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5d6d49113efa..9b774f4b50c8 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -10,21 +10,16 @@ * This has not yet undergone a rigorous security audit. */ -#include -#include #include #include #include -static u32 size_round_up(size_t size, size_t blksize) -{ - return ((size + blksize - 1) / blksize) * blksize; -} - /** - * dir_crypt_complete() - + * fname_crypt_complete() - completion callback for filename crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void dir_crypt_complete(struct crypto_async_request *req, int res) +static void fname_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -35,90 +30,80 @@ static void dir_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { - u32 ciphertext_len; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - struct scatterlist src_sg, dst_sg; + struct scatterlist sg; int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim; + unsigned int lim; + unsigned int cryptlen; lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? - FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = size_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; + /* + * Copy the filename to the output buffer for encrypting in-place and + * pad it with the needed number of NUL bytes. 
+ */ + cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); + cryptlen = round_up(cryptlen, padding); + cryptlen = min(cryptlen, lim); + memcpy(oname->name, iname->name, iname->len); + memset(oname->name + iname->len, 0, cryptlen - iname->len); - if (ciphertext_len <= sizeof(buf)) { - workbuf = buf; - } else { - alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); - if (!alloc_buf) - return -ENOMEM; - workbuf = alloc_buf; - } + /* Initialize the IV */ + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - /* Allocate request */ + /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); - kfree(alloc_buf); + "%s: skcipher_request_alloc() failed\n", __func__); return -ENOMEM; } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); + sg_init_one(&sg, oname->name, cryptlen); + skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - if (iname->len < ciphertext_len) - memset(workbuf + iname->len, 0, ciphertext_len - iname->len); - - /* Initialize IV */ - memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, workbuf, ciphertext_len); - sg_init_one(&dst_sg, oname->name, ciphertext_len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + /* Do the encryption */ res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { + /* Request is being completed asynchronously; wait for it */ wait_for_completion(&ecr.completion); res = ecr.res; } - kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); + return res; + } - oname->len = ciphertext_len; - return res; + oname->len = cryptlen; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -146,7 +131,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -168,7 +153,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -231,9 +216,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) if (ci) padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - if (ilen < FS_CRYPTO_BLOCK_SIZE) - ilen = FS_CRYPTO_BLOCK_SIZE; - return size_round_up(ilen, padding); + ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); + return round_up(ilen, padding); } EXPORT_SYMBOL(fscrypt_fname_encrypted_size); @@ -279,6 +263,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. 
+ * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +275,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +290,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +302,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +323,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +357,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1ac263eddc4e..67fb6d8876d0 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -8,11 +8,8 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
*/ -#include #include -#include #include -#include #include static void derive_crypt_complete(struct crypto_async_request *req, int rc) @@ -139,6 +136,38 @@ out: return res; } +static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, + const char **cipher_str_ret, int *keysize_ret) +{ + if (S_ISREG(inode->i_mode)) { + if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { + *cipher_str_ret = "xts(aes)"; + *keysize_ret = FS_AES_256_XTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported contents encryption mode " + "%d for inode %lu\n", + ci->ci_data_mode, inode->i_ino); + return -ENOKEY; + } + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { + *cipher_str_ret = "cts(cbc(aes))"; + *keysize_ret = FS_AES_256_CTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported filenames encryption mode " + "%d for inode %lu\n", + ci->ci_filename_mode, inode->i_ino); + return -ENOKEY; + } + + pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", + (inode->i_mode & S_IFMT), inode->i_ino); + return -ENOKEY; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -155,8 +184,8 @@ int get_crypt_info(struct inode *inode) struct fscrypt_context ctx; struct crypto_skcipher *ctfm; const char *cipher_str; - u8 raw_key[FS_MAX_KEY_SIZE]; - u8 mode; + int keysize; + u8 *raw_key = NULL; int res; res = fscrypt_initialize(); @@ -179,13 +208,19 @@ retry: if (res < 0) { if (!fscrypt_dummy_context_enabled(inode)) return res; + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; ctx.flags = 0; } else if (res != sizeof(ctx)) { return -EINVAL; } - res = 0; + + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + if (ctx.flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); if (!crypt_info) @@ -198,27 +233,20 @@ retry: crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "%s: unsupported key mode %d (ino %u)\n", - __func__, mode, (unsigned) inode->i_ino); - res = -ENOKEY; + res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); + if (res) goto out; - } + + /* + * This cannot be a stack buffer because it is passed to the scatterlist + * crypto API as part of key derivation. 
+ */ + res = -ENOMEM; + raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + if (!raw_key) + goto out; + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -253,11 +281,12 @@ got_key: crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); + raw_key = NULL; if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { put_crypt_info(crypt_info); goto retry; @@ -268,7 +297,7 @@ out: if (res == -ENOKEY) res = 0; put_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); return res; } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed115acb5dee..6865663aac69 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,6 +109,8 @@ int fscrypt_process_policy(struct file *filp, if (ret) return ret; + inode_lock(inode); + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) ret = -EINVAL; @@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp, ret = -EINVAL; } + inode_unlock(inode); + mnt_drop_write_file(filp); return ret; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5594667c2f41..210082783d5a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -814,12 +814,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7f2fdb154180..468b2dbe6d34 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -451,7 +451,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -1047,7 +1047,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1059,7 +1059,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); return *cookie = paddr; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index 76cff18bb032..ff8b11b26f31 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -111,23 +111,6 @@ struct fscrypt_completion_result { struct fscrypt_completion_result ecr = { \ COMPLETION_INITIALIZER((ecr).completion), 0 } -static inline int fscrypt_key_size(int mode) -{ - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - return FS_AES_256_XTS_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_GCM: - return FS_AES_256_GCM_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CBC: - return FS_AES_256_CBC_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CTS: - return FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - #define FS_FNAME_NUM_SCATTER_ENTRIES 4 #define FS_CRYPTO_BLOCK_SIZE 16 #define FS_FNAME_CRYPTO_DIGEST_SIZE 32 @@ 
-202,13 +185,6 @@ static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); } -static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) -{ - if (size == fscrypt_key_size(mode)) - return size; - return 0; -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') From 7d2eab1921a170ba724bdaf5247d76336ed5472e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Oct 2016 18:46:34 +0800 Subject: [PATCH 1081/3220] f2fs: report error of f2fs_fill_dentries commit ed6bd4b146527e7c6934e3582c47d7b857802676 upstream. Report errors from f2fs_fill_dentries to ->iterate_shared; otherwise, when an error occurs, the user may see only part of the dirents in the target directory without any hint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 21 ++++++++++++--------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inline.c | 6 ++++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 210082783d5a..4436079dbf0c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -785,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -820,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -828,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -872,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -892,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err < 0 ?
err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 932c53f441db..4b13d70d716c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2014,7 +2014,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b859d5e377ae..b85987703d1e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -629,6 +629,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -641,11 +642,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, From a91b9fe273685a11e958a38f373798847e44c41d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 31 Oct 2016 14:01:41 -0700 Subject: [PATCH 1082/3220] f2fs: avoid infinite loop in the EIO case on recover_orphan_inodes commit 099228000eff6b25e0f76b276043cd65cd4eba5a upstream. This patch should fix an infinite loop case below. F2FS-fs : inject IO error in f2fs_read_end_io+0xf3/0x120 [f2fs] F2FS-fs (nvme0n1p1): recover_orphan_inode: orphan failed (ino=39ac1a), run fsck to fix. ... [] sync_meta_pages+0xae/0x270 [f2fs] [] ? flush_sit_entries+0x8d/0x960 [f2fs] [] write_checkpoint+0x361/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] f2fs_balance_fs_bg+0x7e/0x1c0 [f2fs] [] f2fs_write_node_pages+0x34/0x320 [f2fs] [] do_writepages+0x21/0x30 [] __writeback_single_inode+0x61/0x760 [] ? _raw_spin_unlock+0x27/0x40 [] writeback_single_inode+0xd5/0x190 [] write_inode_now+0x99/0xc0 [] iput+0x1f6/0x2c0 [] f2fs_fill_super+0xe0e/0x1300 [f2fs] [] ? sget_userns+0x4f4/0x530 [] mount_bdev+0x182/0x1b0 [] ? f2fs_commit_super+0x100/0x100 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x38/0x170 [] vfs_kern_mount+0x6b/0x160 [] do_mount+0x1be/0xd60 [] ? copy_mount_options+0xb7/0x220 [] SyS_mount+0x94/0xd0 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6034d51fc5fc..e007c011ec53 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1890,6 +1890,13 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). 
+ */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: From 97df49a0c31a5d14f1053ea8c737173e3f38653a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:44:59 +0900 Subject: [PATCH 1083/3220] f2fs: Add missing break in switch-case commit 487df616dec33231c99294b906d720d256a2de16 upstream. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e007c011ec53..4fd34e7bcf60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -420,6 +420,7 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_nodiscard: clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; From 22bbc1efdb5ecb7ad98e5085eb7b42830503f432 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:00 +0900 Subject: [PATCH 1084/3220] f2fs: Use generic zoned block device terminology commit 0bfd7a091c19132489a0f977b8dbf9f6b5ae0a1c upstream. SMR stands for "Shingled Magnetic Recording" which makes sense only for hard disk drives (spinning rust). The ZBC/ZAC standards enable management of SMR disks, but solid state drives may also support those standards. So rename the HMSMR feature to BLKZONED to avoid a HDD centric terminology. For the same reason, rename f2fs_sb_mounted_hmsmr to f2fs_sb_mounted_blkzoned. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f03c34d3797f..a60efb8fdd3a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -111,7 +111,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, { if (!is_read_io(rw)) { atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b13d70d716c..5a563a9f3a52 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -2470,9 +2470,9 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4fd34e7bcf60..3574d0620dc4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -974,7 +974,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { From 1529b8f943a1082e8be64d63c7206b9d72073e1a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:01 +0900 Subject: [PATCH 1085/3220] f2fs: Check zoned block feature for host-managed zoned 
block devices commit d1b959c8770260b611b9a1f0c5e8b12b7cb5b9d2 upstream. The F2FS_FEATURE_BLKZONED feature indicates that the drive was formatted with zone alignment optimization. This is optional for host-aware devices, but mandatory for host-managed zoned block devices. So check that the feature is set in this latter case. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3574d0620dc4..4187e3b9a83e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1639,6 +1639,26 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#else + if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + goto free_sb_buf; + } +#endif + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); From ecc252e7a4d71b2c4197a70f085a3c2435ff8273 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:02 +0900 Subject: [PATCH 1086/3220] f2fs: Suppress discard warning message for zoned block devices commit 0ab0299835738cd407569401da1fef4c97b4419c upstream. For zoned block devices, discard is replaced by zone reset. So do not warn if the device does not support discard. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4187e3b9a83e..3e57ec837de9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,7 +412,7 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); From 4b1d4ef0b7dd50ee42c1de77b03dd4039ee25672 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:03 +0900 Subject: [PATCH 1087/3220] f2fs: Always enable discard for zoned block devices commit 96ba2decb4241aa2c6b61cfc8489d648769eff99 upstream. Zone write pointer reset acts as discard for zoned block devices. So if the zoned block device feature is enabled, always declare that discard is enabled, even if the device does not actually support the command. For the same reason, prevent the use of the "nodiscard" mount option.
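As a usage note for the parse_options() hunk below: with this change, mounting a blkzoned-formatted volume with "-o nodiscard" (e.g. mount -o nodiscard /dev/sdz1 /mnt, hypothetical device name) now fails up front with -EINVAL and logs "discard is required for zoned block devices", instead of quietly disabling a facility that zone management depends on.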
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 +++++++------- fs/f2fs/super.c | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5a563a9f3a52..4fd31208965d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1159,13 +1159,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2475,6 +2468,13 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); +} + static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) { clear_opt(sbi, ADAPTIVE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3e57ec837de9..33676c1e35d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -419,6 +419,11 @@ static int parse_options(struct super_block *sb, char *options) } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); break; case Opt_noheap: From d9d8c376e4929feda317453b75bffd020a4fbd31 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:04 +0900 Subject: [PATCH 1088/3220] f2fs: Do not allow adaptive mode for host-managed zoned block devices commit 3adc57e97792e4ac9f228bde802829e2e9840afe upstream. The LFS mode is mandatory for host-managed zoned block devices as update-in-place optimizations are not possible for segments in sequential zones. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 33676c1e35d7..6bc0810969b7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -518,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { From 018fc18e28098712f19e23712ce24310ba0efdf5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:05 +0900 Subject: [PATCH 1089/3220] f2fs: Cache zoned block devices zone type commit 178053e2f1f9ccdb61ff6c2bd8644b53fc98e72e upstream. With the zoned block device feature enabled, section discard needs to do a zone reset for sections contained in sequential zones, and a regular discard (if supported) for sections stored in conventional zones. Avoid the need for a costly zone report to obtain a section's zone type when discarding it by caching the types of the device zones in the super block information. This cache is initialized at mount time for mounts with the zoned block device feature enabled.
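A worked example of the lookup this cache enables (hypothetical geometry; the helper is the get_blkz_type() added below):

	/* hypothetical geometry: 256 MiB zones, 4 KiB blocks */
	sbi->blocks_per_blkz = (256 << 20) >> 12;	/* 65536 blocks per zone */
	sbi->log_blocks_per_blkz = 16;			/* ilog2(65536) */
	/* e.g. block address 0x12345 falls in zone 0x12345 >> 16 = 0x1 */
	zno = blkaddr >> sbi->log_blocks_per_blkz;
	type = sbi->blkz_type[zno];			/* one cached byte per zone */

so the discard path never needs a blkdev_report_zones() round trip at runtime.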
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 18 +++++++++++++ fs/f2fs/super.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4fd31208965d..c6dba704b0fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -803,6 +803,14 @@ struct f2fs_sb_info { u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; u8 key_prefix_size; #endif + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ + u8 *blkz_type; /* Array of zones type */ +#endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -2468,6 +2476,16 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + + return sbi->blkz_type[zno]; +} +#endif + static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6bc0810969b7..d777a18df958 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1512,6 +1512,65 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_size(bdev) - 1)) + sbi->nr_blkz++; + + sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); + if (!sbi->blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + sbi->blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. 
* Because we have two copies of super block, so read both of them @@ -1758,6 +1817,15 @@ try_onemore: init_ino_entry_info(sbi); +#ifdef CONFIG_BLK_DEV_ZONED + err = init_blkz_info(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + goto free_blkz; + } +#endif + /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -1936,6 +2004,10 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +#ifdef CONFIG_BLK_DEV_ZONED +free_blkz: + kfree(sbi->blkz_type); +#endif kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); From 0573aa06787e9f274bcce187687c0e64f0dd69d2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:06 +0900 Subject: [PATCH 1090/3220] f2fs: Reset sequential zones on zoned block devices commit f46e8809e88d113ba096bcfc772b5182ce00b941 upstream. When a zoned block device is mounted, discarding sections contained in sequential zones must reset the zone write pointer. For sections contained in conventional zones, the regular discard is used if the drive supports it. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ec4d74c26067..8e4863bd36f5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -584,6 +585,45 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +#ifdef CONFIG_BLK_DEV_ZONED +static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t sector = SECTOR_FROM_BLOCK(blkstart); + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + struct block_device *bdev = sbi->sb->s_bdev; + + if (nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Unaligned discard attempted (sector %llu + %llu)", + (unsigned long long)sector, + (unsigned long long)nr_sects); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS, 0); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { @@ -601,6 +641,11 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, sbi->discard_blks--; } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb)) + return f2fs_issue_discard_zone(sbi, blkstart, blklen); +#endif return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } From e96476298b9745df77f1a57d0bf93ff134b174b4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:07 +0900 Subject: [PATCH 1091/3220] f2fs: Trace reset zone events commit 126606c7a99b32ba8265a51fab01533fe40c9ecc upstream. Similarly to the regular discard, trace zone reset events. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + include/trace/events/f2fs.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8e4863bd36f5..06b9d16a19f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -615,6 +615,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, GFP_NOFS, 0); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3a09bb4dc3b2..90d6ad49a9c5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1110,6 +1110,27 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +TRACE_EVENT(f2fs_issue_reset_zone, + + TP_PROTO(struct super_block *sb, block_t blkstart), + + TP_ARGS(sb, blkstart), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blkstart = blkstart; + ), + + TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", + show_dev(__entry), + (unsigned long long)__entry->blkstart) +); + TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct super_block *sb, unsigned int nobarrier, From c82c6b15a1b0775c6f72ad43a4ed79d37b9c5352 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Nov 2016 20:43:21 +0800 Subject: [PATCH 1092/3220] f2fs: record inode updating status correctly commit 60dcedc9972d136fab1600c919dd32080bcc26f7 upstream. We should record the updating status of an inode only while the inode is still linked; for an unlinked inode we need to clear its ino cache, otherwise after the ino is reused it will cause unneeded node page writes during ->fsync. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7b5e402f0a72..af06bda51a54 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -377,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #endif + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); @@ -409,10 +412,12 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); From b2942d459ad6e47c6bcdd55b8df0137f8841d29b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Nov 2016 00:26:55 +0800 Subject: [PATCH 1093/3220] f2fs: fix wrong i_atime recovery commit 9f0552e078b8a25fcf9c3950a05ea1398616d2b9 upstream. We shouldn't update the in-memory i_atime with the on-disk i_mtime of an inode when recovering it. Shuoran found this bug, which had been hidden for a long time; the honour belongs to him.
Signed-off-by: Shuoran Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..d2ba4da08ec3 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,10 +180,10 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); From 0ec5bcd0fe77090d0a8b91012553ff8f85dfa216 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:33:57 -0700 Subject: [PATCH 1094/3220] f2fs: assign segments correctly for direct_io commit bdb7d964c4fb73078ab21c20e8c7b7bf7e641bb6 upstream. Previously, we assigned CURSEG_WARM_DATA for direct_io, but if we have two or four logs, we do not use that type at all. Let's fix it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 06b9d16a19f6..4bdf1191a36f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1422,8 +1422,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct curseg_info *curseg; bool direct_io = (type == CURSEG_DIRECT_IO); - type = direct_io ? CURSEG_WARM_DATA : type; - + if (direct_io) { + if (sbi->active_logs <= 4) + type = CURSEG_HOT_DATA; + else + type = CURSEG_WARM_DATA; + } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); From 67609b128649f686d29047cff009ad28b01eac16 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:59:15 -0700 Subject: [PATCH 1095/3220] f2fs: remove checkpoint in f2fs_freeze commit b4b9d34c855ef383402cd1acefb1e33a515ae5f5 upstream. The generic freeze_super() calls sync_filesystems() before f2fs_freeze(). So, basically we don't need to do checkpoint in f2fs_freeze(). But, in xfs/068, it triggers circular locking problem below due to gc_mutex for checkpoint. ====================================================== [ INFO: possible circular locking dependency detected ] 4.9.0-rc1+ #132 Tainted: G OE ------------------------------------------------------- 1. wait for __sb_start_write() by [] dump_stack+0x85/0xc2 [] print_circular_bug+0x1cf/0x230 [] __lock_acquire+0x19e0/0x1bc0 [] lock_acquire+0x11b/0x220 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] __sb_start_write+0x130/0x200 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] f2fs_drop_inode+0x9b/0x160 [f2fs] [] iput+0x171/0x2c0 [] f2fs_sync_inode_meta+0x3f/0xf0 [f2fs] [] block_operations+0x84/0x110 [f2fs] [] write_checkpoint+0xe8/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] ? sched_clock+0x9/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] ? do_fsync+0x70/0x70 [] ? do_fsync+0x70/0x70 [] sync_fs_one_sb+0x20/0x30 [] iterate_supers+0xae/0x100 [] sys_sync+0x55/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 2. 
wait for sbi->gc_mutex by [] lock_acquire+0x11b/0x220 [] mutex_lock_nested+0x76/0x3f0 [] f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_freeze+0x1c/0x20 [f2fs] [] freeze_super+0xcf/0x190 [] do_vfs_ioctl+0x53c/0x6a0 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d777a18df958..7a0634b0bee8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -800,13 +800,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -2153,3 +2157,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + From e035bdfc8b8af673fc8bfe0c4b5aeff6f9d57a9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 5 Nov 2016 11:12:40 +0800 Subject: [PATCH 1096/3220] Revert "f2fs: do not recover from previous remained wrong dnodes" commit d47b8715953ad0cf82bb0a9d30d7b11d83bc9c11 upstream. The i_times of an inode are set from the current system time, which can be configured through 'date', so it's not safe to judge a dnode block as garbage data or an unchanged inode depending on i_times. Now that we use the enhanced 'cp_ver + cp' crc method to verify valid dnode blocks, I expect that recovering an invalid dnode is almost impossible. This reverts commit 807b1e1c8e08452948495b1a9985ab46d329e5c2.
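The 'cp_ver + cp' crc method mentioned here stamps node blocks with the checkpoint version, folding the checkpoint CRC into its high bits, so a stale dnode left over from an earlier mount cannot pass for current data. A simplified sketch of the verification side (condensed from the f2fs code of this era; helper names as assumed from f2fs.h/node.h, not the exact hunk):

	static bool written_after_last_checkpoint(struct f2fs_sb_info *sbi,
						  struct page *page)
	{
		struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
		__u64 cp_ver = cur_cp_version(ckpt);

		/* fold the checkpoint CRC into the high 32 bits of the version */
		if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
			cp_ver |= (cur_cp_crc(ckpt) << 32);

		return cp_ver == cpver_of_node(page);
	}

With that check in place, the time-based is_same_inode() heuristic removed below adds nothing, and it can misfire whenever the system clock is set backwards.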
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d2ba4da08ec3..62523b217571 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -196,32 +196,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +222,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) From 5065008a77ef2b08339b4cc1d1f11a645821dce3 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 7 Nov 2016 21:22:31 +0800 Subject: [PATCH 1097/3220] f2fs: return directly if block has been removed from the victim commit 206147112860418fa9363cf94ec2c172bdf4313a upstream. If a block has already been written to a new place, just return in the move-data process. This patch checks it again while holding the page lock.
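Design note on the change below: the block's validity was established when the GC victim was selected, but the GC path can block before it pins the data page, so check_valid_map() is consulted a second time once the page is held; if the block was reclaimed or rewritten in the meantime, the move is simply skipped instead of copying stale data.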
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 72a0ca08f901..744031194934 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -544,7 +544,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -564,6 +565,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -643,7 +647,8 @@ out: f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -651,6 +656,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -792,9 +800,9 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); From 00e5a211f9c0e5f2672165c7ee12aadc467d8a2b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:31:40 -0800 Subject: [PATCH 1098/3220] f2fs: revert segment allocation for direct IO commit 6ae1be13e85f4c42c8ca371fda50ae39eebbfd96 upstream. Now we don't need to be so careful about storage alignment for dio, since storage has become quite fast, and we'd better avoid any misalignment first.
Revert: 38aa0889b250 (f2fs: align direct_io'ed data to section) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +----- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 36 +++++++++--------------------------- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a60efb8fdd3a..beb747132858 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -583,7 +583,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -601,11 +600,8 @@ alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6dba704b0fe..4d4bfbaeb788 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -639,7 +639,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4bdf1191a36f..19ab2e63d8d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1267,25 +1267,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1419,25 +1415,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - if (direct_io) { - if (sbi->active_logs <= 4) - type = CURSEG_HOT_DATA; - else - type = CURSEG_WARM_DATA; - } - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* From f9baf967bddd70a2f3d67793a16ac51c640a3bdd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:08:22 -0800 Subject: [PATCH 1099/3220] f2fs: allow dio read for LFS mode commit e57e9ae5b179a6b243c42bf6d9549d1595c27089 upstream. We can allow dio reads for LFS mode, while doing buffered writes for dio writes. 
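The gate itself is a one-line condition; a small standalone sketch of the intent (hypothetical names, not the f2fs signatures): direct reads allocate no blocks, so they cannot break the LFS append-only write discipline, while direct writes fall back to the buffered path:

    #include <stdbool.h>
    #include <stdio.h>

    enum rw { RW_READ, RW_WRITE };

    /* returns true when the request must fall back to buffered IO */
    static bool dio_must_buffer(enum rw rw, bool lfs_mode)
    {
        return rw == RW_WRITE && lfs_mode;  /* reads pass through untouched */
    }

    int main(void)
    {
        printf("LFS dio read  buffered? %d\n", dio_must_buffer(RW_READ, true));
        printf("LFS dio write buffered? %d\n", dio_must_buffer(RW_WRITE, true));
        return 0;
    }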
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index beb747132858..f59c91765b1d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1735,7 +1735,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; if (trace_android_fs_dataread_start_enabled() && From 07f010798cc238d227dad56cc52e956094914099 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Oct 2016 19:02:05 -0700 Subject: [PATCH 1100/3220] f2fs: support multiple devices commit 3c62be17d4f562f43fe1d03b48194399caa35aa5 upstream. This patch implements multiple devices support for f2fs. Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big volume under one f2fs instance. Internal block management is very simple, but we will modify block allocation and background GC policy to boost IO speed by exploiting them accoording to each device speed. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 ++++++++++++++-- fs/f2fs/f2fs.h | 29 +++++++-- fs/f2fs/segment.c | 112 +++++++++++++++++++++++--------- fs/f2fs/super.c | 138 +++++++++++++++++++++++++++++++--------- include/linux/f2fs_fs.h | 10 ++- 5 files changed, 274 insertions(+), 70 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f59c91765b1d..31e9c64e0f1d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -88,6 +88,46 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } +/* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + /* * Low-level block read/write IO operations. */ @@ -98,8 +138,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? 
NULL : sbi; @@ -269,7 +308,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - io->fio.rw != fio->rw)) + (io->fio.rw != fio->rw) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -956,7 +996,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -974,8 +1013,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1069,7 +1107,8 @@ got_it: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; @@ -1737,6 +1776,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; + if (F2FS_I_SB(inode)->s_ndevs) + return 0; if (trace_android_fs_dataread_start_enabled() && (iov_iter_rw(iter) == READ)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d4bfbaeb788..04f6dddc6d91 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -756,6 +756,20 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ @@ -804,10 +818,8 @@ struct f2fs_sb_info { #endif #ifdef CONFIG_BLK_DEV_ZONED - unsigned int nr_blkz; /* Total number of zones */ unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ - u8 *blkz_type; /* Array of zones type */ #endif /* for node-related operations */ @@ -924,6 +936,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -2190,6 +2204,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); @@ -2477,11 +2494,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) #ifdef CONFIG_BLK_DEV_ZONED static 
inline int get_blkz_type(struct f2fs_sb_info *sbi, - block_t blkaddr) + struct block_device *bdev, block_t blkaddr) { unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; - return sbi->blkz_type[zno]; + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; } #endif diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 19ab2e63d8d7..30d0e9a76c62 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -401,6 +401,32 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -411,24 +437,18 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); - + ret = submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -449,14 +469,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -586,18 +603,24 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } #ifdef CONFIG_BLK_DEV_ZONED -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t sector = SECTOR_FROM_BLOCK(blkstart); sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - struct block_device *bdev = sbi->sb->s_bdev; + sector_t sector; + int devi = 0; - if (nr_sects != bdev_zone_size(bdev)) { + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, - "Unaligned discard attempted (sector %llu + %llu)", - (unsigned long long)sector, - (unsigned long long)nr_sects); + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); return -EIO; } @@ -606,7 +629,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, * use regular discard if the drive supports it. For sequential * zones, reset the zone write pointer. 
*/ - switch (get_blkz_type(sbi, blkstart)) { + switch (get_blkz_type(sbi, bdev, blkstart)) { case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) @@ -625,29 +648,60 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } #endif -static int f2fs_issue_discard(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t start = SECTOR_FROM_BLOCK(blkstart); sector_t len = SECTOR_FROM_BLOCK(blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb)) - return f2fs_issue_discard_zone(sbi, blkstart, blklen); -#endif - return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a0634b0bee8..2d332a16de71 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -1517,9 +1532,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED -static int init_blkz_info(struct f2fs_sb_info *sbi) +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { - struct block_device *bdev = sbi->sb->s_bdev; + struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; sector_t sector = 0; struct blk_zone *zones; @@ -1530,15 +1545,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) if (!f2fs_sb_mounted_blkzoned(sbi->sb)) return 0; + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + return -EINVAL; 
sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); - sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> - sbi->log_blocks_per_blkz; + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; if (nr_sectors & (bdev_zone_size(bdev) - 1)) - sbi->nr_blkz++; + FDEV(devi).nr_blkz++; - sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); - if (!sbi->blkz_type) + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 @@ -1563,7 +1584,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) } for (i = 0; i < nr_zones; i++) { - sbi->blkz_type[n] = zones[i].type; + FDEV(devi).blkz_type[n] = zones[i].type; sector += zones[i].len; n++; } @@ -1667,6 +1688,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? 
+ "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1725,15 +1817,7 @@ try_onemore: "Zoned block device support is not enabled\n"); goto free_sb_buf; } -#else - if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sb)) { - f2fs_msg(sb, KERN_ERR, - "Zoned block device feature not enabled\n"); - goto free_sb_buf; - } #endif - default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1803,6 +1887,13 @@ try_onemore: goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1821,15 +1912,6 @@ try_onemore: init_ino_entry_info(sbi); -#ifdef CONFIG_BLK_DEV_ZONED - err = init_blkz_info(sbi); - if (err) { - f2fs_msg(sb, KERN_ERR, - "Failed to initialize F2FS blkzone information"); - goto free_blkz; - } -#endif - /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -2008,10 +2090,8 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -#ifdef CONFIG_BLK_DEV_ZONED -free_blkz: - kfree(sbi->blkz_type); -#endif +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e46e7d10312b..3e5972ef5019 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -52,10 +52,17 @@ #define VERSION_LEN 256 #define MAX_VOLUME_NAME 512 +#define MAX_PATH_LEN 64 +#define MAX_DEVICES 8 /* * For superblock */ +struct f2fs_device { + __u8 path[MAX_PATH_LEN]; + __le32 total_segments; +} __packed; + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -94,7 +101,8 @@ struct f2fs_super_block { __le32 feature; /* defined features */ __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ - __u8 reserved[871]; /* valid reserved region */ + struct f2fs_device devs[MAX_DEVICES]; /* device list */ + __u8 reserved[327]; /* valid reserved region */ } __packed; /* From bd8e41540a703c57994fd4398753dbb9b6679d3e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:31:56 -0800 Subject: [PATCH 1101/3220] f2fs: use err for f2fs_preallocate_blocks commit a7de608691f766cd148971a71d4f13aa1692d4c8 upstream. This patch has no functional change. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 26 +++++++++++++------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 35 +++++++++++++++++++---------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 31e9c64e0f1d..87e07665832e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -653,11 +653,11 @@ alloc: return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -669,19 +669,19 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -858,19 +858,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04f6dddc6d91..8dc378d82f67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2212,7 +2212,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce38a350fb38..fbfcd809baec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1324,15 +1324,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1344,12 +1344,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) 
{ + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1363,7 +1363,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -2267,12 +2267,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); From 8b2c7581e85f6b3464763952ef54c82691b71c4e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:46:40 -0800 Subject: [PATCH 1102/3220] f2fs: fix redundant block allocation commit c040ff9d69fd1d782fe577ba9e35c1f5798158ae upstream. In direct_IO path of f2fs_file_write_iter(), 1. f2fs_preallocate_blocks(F2FS_GET_BLOCK_PRE_DIO) -> allocate LBA X 2. f2fs_direct_IO() -> return 0; Then, f2fs_write_data_page() will allocate another LBA X+1. This makes EIO triggered by HM-SMR. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87e07665832e..ed1ac24ba7c4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -653,6 +653,13 @@ alloc: return 0; } +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -672,7 +679,10 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) err = f2fs_convert_inline_inode(inode); if (err) return err; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { err = f2fs_convert_inline_inode(inode); @@ -1772,11 +1782,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) - return 0; - if (F2FS_I_SB(inode)->s_ndevs) + if (__force_buffered_io(inode, rw)) return 0; if (trace_android_fs_dataread_start_enabled() && From 79ba046b115eaa4c0f30104bccb6f41ae710e952 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 17:38:35 -0800 Subject: [PATCH 1103/3220] f2fs: avoid BG_GC in f2fs_balance_fs commit 7702bdbe505a22380dd958e2ee35124c7c414806 upstream. If many threads hit has_not_enough_free_secs() in f2fs_balance_fs() at the same time, all the threads would do FG_GC or BG_GC. In this critical path, we totally don't need to do BG_GC at all. Let's avoid that. 
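A toy sketch of the resulting calling convention (names and structure are illustrative, not the f2fs signatures): only the allocation critical path passes background=false, so contention there no longer fans out into useless background cleaning:

    #include <stdbool.h>
    #include <stdio.h>

    enum gc_type { BG_GC, FG_GC };

    static int run_gc(bool sync, bool background)
    {
        enum gc_type type = sync ? FG_GC : BG_GC;

        if (type == BG_GC && !background) {
            /* critical allocation path: nothing useful to do, bail out */
            return 0;
        }
        printf("running %s\n", type == FG_GC ? "FG_GC" : "BG_GC");
        return 1;
    }

    int main(void)
    {
        run_gc(false, false);  /* f2fs_balance_fs(): skips BG_GC */
        run_gc(false, true);   /* background GC thread: BG_GC allowed */
        run_gc(true, false);   /* urgent reclaim: FG_GC still runs */
        return 0;
    }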
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 7 +++++-- fs/f2fs/segment.c | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8dc378d82f67..687ab43a6cd8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2235,7 +2235,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fbfcd809baec..84f4572ae959 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1853,7 +1853,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 744031194934..54d06c21af07 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -905,7 +905,7 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; @@ -946,6 +946,9 @@ gc_more: if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30d0e9a76c62..27e1b7c56e4c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -364,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } From df3f20f12b9fcf4c93706ac24edcb6a3ff9f58a5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 18:20:10 -0800 Subject: [PATCH 1104/3220] f2fs: fix wrong written_valid_blocks counting commit c79b7ff1d3c7710c23d8828a69d8cbc5597ad19f upstream. Previously, written_valid_blocks was taken from ckpt->valid_block_count. But if the last checkpoint contains some NEW_ADDR blocks due to a power cut, we can get a wrong value. Fix it to derive the number from the actual written block counts in the sit entries.
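The idea of the fix can be shown with a toy model (made-up numbers, not the real sit layout): rebuild the counter by summing the per-segment valid block counts that the sit entries actually record, instead of trusting one checkpoint header field:

    #include <stdio.h>

    /* toy per-segment valid block counts, as sit entries would record them */
    static const unsigned int valid_blocks[] = { 512, 0, 37, 0, 481 };
    #define NSEGS (sizeof(valid_blocks) / sizeof(valid_blocks[0]))

    int main(void)
    {
        unsigned long long written = 0;

        /* sum what is actually marked valid on disk; a single header count
         * could still include NEW_ADDR blocks left by a power cut */
        for (unsigned int seg = 0; seg < NSEGS; seg++)
            written += valid_blocks[seg];

        printf("written_valid_blocks = %llu\n", written);
        return 0;
    }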
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 27e1b7c56e4c..58cae4a541a7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2176,7 +2176,6 @@ out: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2243,7 +2242,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2397,6 +2396,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ From 9e266223b33dac867e29e7d0393e363ff86e092a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Nov 2016 10:41:20 +0800 Subject: [PATCH 1105/3220] f2fs: don't wait writeback for datas during checkpoint commit 36951b38d13ac7cce9fcf89e0e01c22ed0d05688 upstream. Normally, while committing a checkpoint, we wait on all pages to be written back no matter whether a page holds data or metadata, so in a scenario where lots of data IO is submitted together with metadata, we may suffer long latency waiting for writeback during checkpoint. Indeed, we only care about persistence for pages with metadata, not pages with data, since file system consistency is only related to metadata. So, to avoid that long latency, let's recognize and reference metadata in submitted IOs and wait on writeback only for metadata.
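The classification can be summarized with a small standalone sketch (a simplified predicate; the real __is_cp_guaranteed() inspects the page's mapping and inode): pages the checkpoint must persist are counted separately, and the checkpoint drains only that counter:

    #include <stdbool.h>
    #include <stdio.h>

    enum wb_type { WB_DATA, WB_CP_DATA };
    static int nr_wb[2];

    /* simplified version of the rule: meta/node inodes, directories and
     * cold data must survive the checkpoint; plain file data need not */
    static enum wb_type classify(bool meta_or_node, bool dir, bool cold)
    {
        return (meta_or_node || dir || cold) ? WB_CP_DATA : WB_DATA;
    }

    int main(void)
    {
        nr_wb[classify(true, false, false)]++;   /* node page in flight */
        nr_wb[classify(false, false, false)]++;  /* regular data page */

        /* wait_on_all_pages_writeback() now only drains the CP counter */
        printf("checkpoint waits on %d page(s), ignores %d\n",
               nr_wb[WB_CP_DATA], nr_wb[WB_DATA]);
        return 0;
    }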
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 35 +++++++++++++++++++++++++++++------ fs/f2fs/debug.c | 7 ++++--- fs/f2fs/f2fs.h | 9 ++++++--- fs/f2fs/file.c | 2 -- fs/f2fs/gc.c | 2 -- fs/f2fs/segment.c | 1 - 7 files changed, 40 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ed79757c36e0..889317e07122 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1005,7 +1005,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ed1ac24ba7c4..fa254daac970 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -30,6 +30,26 @@ #include #include +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; @@ -72,6 +92,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -79,9 +100,11 @@ static void f2fs_write_end_io(struct bio *bio) set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); @@ -149,7 +172,6 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, struct bio *bio, enum page_type type) { if (!is_read_io(rw)) { - atomic_inc(&sbi->nr_wb_bios); if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); @@ -305,6 +327,11 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || @@ -318,8 +345,6 @@ alloc_new: io->fio = *fio; } - bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -1331,7 +1356,6 @@ done: if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1745,7 +1769,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 678733c78f55..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -313,8 +314,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 687ab43a6cd8..d6119ea3b86d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -702,6 +702,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? 
F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -709,6 +710,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -888,7 +891,6 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1302,7 +1304,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2261,7 +2264,7 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84f4572ae959..bab65f0a5bb5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -95,8 +95,6 @@ mapped: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 54d06c21af07..6390d45c1b68 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -690,8 +690,6 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 58cae4a541a7..23e8892c4e60 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -288,7 +288,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); From beaab6afb4c86fd7daa47577220d597cdc183a46 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 16 Nov 2016 17:26:24 +0800 Subject: [PATCH 1106/3220] f2fs: fix an infinite loop when flush nodes in cp commit d40a43af0a57a017eba9ad2679183791587ceb6a upstream. Thread A: write_checkpoint -> block_operations -> blk_start_plug -> sync_node_pages. Thread B: f2fs_do_sync_file -> fsync_node_pages -> f2fs_wait_on_page_writeback. Thread A waits for the global F2FS_DIRTY_NODES count to drop to zero; it has started a plug list, and some requests have already been added to that list. Thread B locks one dirty node page and waits for the page to be written back. But that page is sitting in thread A's plug list with the PG_writeback flag set. Thread A keeps running and its plug list never gets a chance to finish, so this is a deadlock between the cp and fsync paths. This patch adds a wait on page writeback before setting the node page dirty, to avoid the problem.
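The shape of the fix, namely blocking until the in-flight writeback finishes before dirtying the page again, can be sketched with a condition variable; under_writeback and the helpers below are illustrative stand-ins for PG_writeback and f2fs_wait_on_page_writeback(), not kernel code:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
    static bool under_writeback;      /* stand-in for PG_writeback */

    /* stand-in for f2fs_wait_on_page_writeback(): sleep until the bio that
     * may still be parked in another thread's plug list has completed */
    static void wait_on_writeback(void)
    {
        pthread_mutex_lock(&m);
        while (under_writeback)
            pthread_cond_wait(&done, &m);
        pthread_mutex_unlock(&m);
    }

    static void end_writeback(void)   /* bio completion path */
    {
        pthread_mutex_lock(&m);
        under_writeback = false;
        pthread_cond_signal(&done);
        pthread_mutex_unlock(&m);
    }

    int main(void)
    {
        end_writeback();      /* pretend the plugged bio finally completed */
        wait_on_writeback();  /* only now is it safe to set_page_dirty() */
        printf("page re-dirtied safely\n");
        return 0;
    }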
Signed-off-by: Yunlei He Signed-off-by: Pengyang Hou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 389be7f6e07c..59cc29e6b73c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1409,6 +1409,7 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; From 031017c6f9922c9c7b7819e876531de3f7ecb065 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:11 +0800 Subject: [PATCH 1107/3220] f2fs: fix to account total free nid correctly commit 04d47e673863c637a2b44ad34a558aeb5d0a727e upstream. Thread A Thread B Thread C - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid alloc last nid - f2fs_unlock_op - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid as node count still not be increased, we will loop in alloc_nid - f2fs_write_node_pages - f2fs_balance_fs_bg - f2fs_sync_fs - write_checkpoint - block_operations - f2fs_lock_all - f2fs_lock_op While creating new inode, we do not allocate and account nid atomically, so that when there is almost no free nids left, we may encounter deadloop like above stack. In order to avoid that, reuse nm_i::available_nids for accounting free nids and make nid allocation and counting being atomical during node creation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d6119ea3b86d..973ca74404de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -560,7 +560,7 @@ enum nid_list { struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 59cc29e6b73c..edacbabb92cf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1885,11 +1885,13 @@ retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; - spin_lock(&nm_i->nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + /* We should not use stale free nids created by build_free_nids */ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); @@ -1900,6 +1902,7 @@ retry: __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1951,6 +1954,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i->state = NID_NEW; __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } + + nm_i->available_nids++; + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2150,6 +2156,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after 
last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2222,8 +2241,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2298,7 +2321,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID_LIST] = 0; nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; From 8f5fcb8034c939bb9e6c03fe748a08df85a8503a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:31 +0800 Subject: [PATCH 1108/3220] f2fs: fix fdatasync commit 281518c694a5228d6c46fac83529fb3e2c331281 upstream. For the two cases below, we can't guarantee data consistency: a) 1. xfs_io "pwrite 0 4195328" "fsync" 2. xfs_io "pwrite 4195328 1024" "fdatasync" 3. godown 4. umount & mount --> the isize we updated before fdatasync won't be recovered b) 1. xfs_io "pwrite -S 0xcc 0 4202496" "fsync" 2. xfs_io "fpunch 4194304 4096" "fdatasync" 3. godown 4. umount & mount --> the dnode we punched before fdatasync won't be recovered The reason is that fdatasync is normally not aware of modifications to a file's metadata, e.g. isize changes or dnode updates, so in ->fsync we skip flushing node pages for the above cases, resulting in the fdatasynced data being lost during recovery. Since we have introduced the DIRTY_META global list in sbi for selectively tracking dirty inodes, in fdatasync we can choose to flush nodes depending on the dirty state of the current inode in that list.
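A small sketch of the new decision (toy types; the real code tests list_empty(&F2FS_I(inode)->gdirty_list) under the DIRTY_META inode lock): fdatasync may skip the node flush only when the inode carries no dirty metadata at all:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_inode {
        bool on_gdirty_list;  /* inode sits on the DIRTY_META list */
    };

    static bool can_skip_node_flush(const struct toy_inode *inode, bool dsync)
    {
        if (dsync)
            return !inode->on_gdirty_list;
        /* plain fsync keeps its older, stricter conditions (omitted here) */
        return false;
    }

    int main(void)
    {
        struct toy_inode appended = { .on_gdirty_list = true };  /* isize grew */
        struct toy_inode clean    = { .on_gdirty_list = false };

        printf("fdatasync after append: skip node flush = %d\n",
               can_skip_node_flush(&appended, true));
        printf("fdatasync, clean metadata: skip node flush = %d\n",
               can_skip_node_flush(&clean, true));
        return 0;
    }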
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++++- fs/f2fs/file.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 973ca74404de..16bedd87022d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1763,8 +1763,17 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bab65f0a5bb5..7fd8e7cffe9b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -209,7 +209,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } From 7ab9a6acd086ecdc9a5e86785b7abb3076118ba1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 15:09:48 -0800 Subject: [PATCH 1109/3220] f2fs: do not recover i_size if it's valid commit 3a3a5ead7b6d2c9a29f493791ba23f264052db34 upstream. If i_size is already valid during roll_forward recovery, we should not update it according to the block alignment. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 62523b217571..687c176f0b56 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -425,7 +425,7 @@ retry_dn: continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (i_size_read(inode) <= (start << PAGE_SHIFT)) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* From 7ce8cbc7faa97d91231383dc662316226e2dcd15 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 18:53:16 -0800 Subject: [PATCH 1110/3220] f2fs: fix wrong AUTO_RECOVER condition commit 97dd26ad834739d4e4ea35fd7ab5f92824de4cbb upstream. If i_size is not aligned to the f2fs's block size, we should not skip inode update during fsync. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16bedd87022d..fa66e5baa58a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1774,7 +1774,8 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + i_size_read(inode) & PAGE_MASK) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); } From 23eb5177568d70646669871d2268106947630990 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 18 Nov 2016 22:21:13 +0800 Subject: [PATCH 1111/3220] f2fs: drop duplicate header timer.h commit b4ceec29219e340178baa9c5f17bf97a42951cc8 upstream. Drop duplicate header timer.h from segment.c. 
Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 23e8892c4e60..ba715d60c738 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "segment.h" From a680707a3f4161761e35effaedbdba02ee7ee069 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Nov 2016 22:27:41 +0800 Subject: [PATCH 1112/3220] f2fs: fix incorrect free inode count in ->statfs commit b08b12d2ddc85b977a0531470cf6a7158289aaaf upstream. While calculating inode count that we can create at most in the left space, we should consider space which data/node blocks occupied, since we create data/node mixly in main area. So fix the wrong calculation in ->statfs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2d332a16de71..a288456c17c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -852,7 +852,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; From 55342cce61558f36270ba7056f29260b2e769d7e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Nov 2016 15:20:16 +0100 Subject: [PATCH 1113/3220] f2fs: fix 32-bit build commit 19c526515f6b998039d5d71fea879d255f173746 upstream. The addition of multiple-device support broke CONFIG_BLK_DEV_ZONED on 32-bit machines because of a 64-bit division: fs/f2fs/f2fs.o: In function `__issue_discard_async': extent_cache.c:(.text.__issue_discard_async+0xd4): undefined reference to `__aeabi_uldivmod' Fortunately, bdev_zone_size() is guaranteed to return a power-of-two number, so we can replace the % operator with a cheaper bit mask. Fixes: 792b84b74b54 ("f2fs: support multiple devices") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ba715d60c738..4f557e5e789d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -614,7 +614,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } sector = SECTOR_FROM_BLOCK(blkstart); - if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path: "", From 9f495d826bf3dc4894f3cb4ef61af77102a38d9a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 28 Nov 2016 15:33:38 -0800 Subject: [PATCH 1114/3220] f2fs: do not activate auto_recovery for fallocated i_size commit 26787236b36660baf4d136281d40b5bb33a570ec upstream. If a file needs to keep its i_size by fallocate, we need to turn off auto recovery during roll-forward recovery. This will resolve the below scenario. 1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 4096" -c "fsync" 2. xfs_io -f /mnt/f2fs/file -c "falloc -k 4096 4096" -c "fsync" 3. md5sum /mnt/f2fs/file; 4. godown /mnt/f2fs/ 5. umount /mnt/f2fs/ 6. mount -t f2fs /dev/sdx /mnt/f2fs 7. 
md5sum /mnt/f2fs/file Reported-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 38 +++++++++++++++++++++----------------- fs/f2fs/file.c | 2 ++ fs/f2fs/recovery.c | 11 ++++++++--- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa66e5baa58a..4f1046be0d74 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -449,6 +449,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -461,6 +462,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -1763,23 +1766,6 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) -{ - if (dsync) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; - - spin_lock(&sbi->inode_lock[DIRTY_META]); - ret = list_empty(&F2FS_I(inode)->gdirty_list); - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return ret; - } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || - i_size_read(inode) & PAGE_MASK) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; @@ -1932,6 +1918,24 @@ static inline void clear_file(struct inode *inode, int type) f2fs_mark_inode_dirty_sync(inode, true); } +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7fd8e7cffe9b..57b6dbcbbd88 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1403,6 +1403,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 687c176f0b56..981a9584b62f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -187,6 +187,8 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = ""; else @@ -425,7 +427,8 @@ retry_dn: continue; } - if (i_size_read(inode) <= (start << PAGE_SHIFT)) + if (!file_keep_isize(inode) && + 
(i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -478,8 +481,10 @@ err: f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? "keep" : "recover", + recovered, err); return err; } From 9a82dd23e414b084ed191d8506b99cb85f401028 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 Nov 2016 19:13:43 -0800 Subject: [PATCH 1115/3220] f2fs: return AOP_WRITEPAGE_ACTIVATE for writepage commit 0002b61bdaac732bcff364a18f5bd57c95def0a5 upstream. We should use AOP_WRITEPAGE_ACTIVATE when we bypass writing pages. Signed-off-by: Chao Yu Signed-off-by: Miao Xie Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fa254daac970..283fc9de4762 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1376,6 +1376,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1471,6 +1473,15 @@ continue_unlock: ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; From 036ed1b8ebbbe2db084f83bb8e03d83a0d3d6ee6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2016 15:11:32 -0800 Subject: [PATCH 1116/3220] Revert "f2fs: use percpu_counter for # of dirty pages in inode" commit 204706c7accfabb67b97eef9f9a28361b6201199 upstream. This reverts commit 1beba1b3a953107c3ff5448ab4e4297db4619c76. The percpu_counter doesn't provide atomicity on a single core and consumes more DRAM. That incurs an fs_mark test failure due to ENOMEM. Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/file.c | 2 +- fs/f2fs/super.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4f1046be0d74..c7eb2ff398ce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -479,7 +479,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -1316,7 +1316,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1332,7 +1332,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1342,9 +1342,9 @@ static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 57b6dbcbbd88..5c0500813efe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1537,7 +1537,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a288456c17c0..ce09191891f8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -571,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -703,7 +699,6 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } From d1c2c35718474b52d19a58acffe7a698d6dc764c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 11:37:14 -0800 Subject: [PATCH 1117/3220] f2fs: call sync_fs when f2fs is idle commit f455c8a5f0a24090e99249eb7280012376adec2c upstream. The sync_fs in f2fs_balance_fs_bg must avoid interrupting current user requests. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4f557e5e789d..b95f07559d90 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -381,12 +381,15 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false); + if (!is_idle(sbi)) + return; + /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; From 5b80a5e2bef9d5291c832cd367dd038eeaa753d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 13:56:04 -0800 Subject: [PATCH 1118/3220] f2fs: detect wrong layout commit 2040fce83fe17763b07c97c1f691da2bb85e4135 upstream. Previous versions of mkfs.f2fs inappropriately allowed small partitions, so f2fs should detect that as well. Refer to this commit in f2fs-tools:
mkfs.f2fs: detect small partition by overprovision ratio and # of segments Reported-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 89ab4301ef02..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ce09191891f8..07f4ba444733 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1453,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1464,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 80ea4ddbb01c6e1f89432bf6544dcebb2e691cf2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 17:25:32 -0800 Subject: [PATCH 1119/3220] f2fs: free meta pages if sanity check for ckpt fails commit a2125ff7dd1ed3a2a53cdc1f8f9c9cec9cfaa7ab upstream. This fixes missing freeing of meta pages in the error case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 889317e07122..640f28576e88 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -768,7 +768,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; if (cur_page == cp1) sbi->cur_cp_pack = 1; @@ -796,6 +796,9 @@ done: f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; From be4b8492172a2f44774ac81384e60cdc396a6d6f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Dec 2016 16:23:32 -0800 Subject: [PATCH 1120/3220] f2fs: fix to access nullified flush_cmd_control pointer commit 5eba8c5d1fb3af28b2073ba5228d4998196c1bcc upstream. f2fs_sync_file() can race with a read-only remount: remount_ro -> f2fs_readonly -> destroy_flush_cmd_control frees fcc, after which f2fs_issue_flush() finds no fcc pointer! So, this patch doesn't free fcc in this case, but just stops its kernel thread which sends flush commands.
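The shape of the fix deserves a sketch: teardown first clears the thread handle inside the still-allocated control structure, so a racing issuer either finds a live thread or falls back, but never dereferences freed memory. Below is a small userspace model of that pattern under stated assumptions; flush_ctl, thread_live and destroy_flush_ctl() are invented names, with pthread_join() standing in for kthread_stop().

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>

struct flush_ctl {
	pthread_t thread;
	bool thread_live;        /* ~ fcc->f2fs_issue_flush != NULL */
};

static struct flush_ctl *ctl;    /* ~ SM_I(sbi)->cmd_control_info */

static void *flush_thread(void *arg)
{
	return NULL;             /* a real thread would drain flush requests */
}

/* remount_ro passes free_struct = false: stop the thread, keep ctl */
static void destroy_flush_ctl(bool free_struct)
{
	if (ctl && ctl->thread_live) {
		ctl->thread_live = false;        /* issuers now fall back */
		pthread_join(ctl->thread, NULL); /* ~ kthread_stop() */
	}
	if (free_struct) {                       /* only at unmount */
		free(ctl);
		ctl = NULL;
	}
}

int main(void)
{
	ctl = calloc(1, sizeof(*ctl));
	pthread_create(&ctl->thread, NULL, flush_thread, NULL);
	ctl->thread_live = true;

	destroy_flush_ctl(false);   /* remount read-only: ctl survives */
	printf("ctl still valid: %d\n", ctl != NULL);
	destroy_flush_ctl(true);    /* unmount: now it is freed */
	return 0;
}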
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 33 +++++++++++++++++++++++++-------- fs/f2fs/super.c | 5 +++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7eb2ff398ce..3ef2d93ab936 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2150,7 +2150,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b95f07559d90..a288de069164 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -486,8 +486,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -498,6 +503,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -505,6 +515,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { @@ -517,14 +528,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -2658,7 +2675,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07f4ba444733..e6d8d011786c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1103,8 +1103,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. 
*/ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; From 02dcb93a1d1707c7776e92cf302a33178916ce2a Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sun, 11 Dec 2016 15:35:15 +0800 Subject: [PATCH 1121/3220] f2fs: fix a missing size change in f2fs_setattr commit c0ed4405a99ec9be2a0f062eaafc002d8d26c99f upstream. This patch fixes a missing size change in f2fs_setattr. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c0500813efe..5808d5c709a7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -678,6 +678,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = inode_change_ok(inode, attr); if (err) @@ -708,6 +709,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -720,8 +723,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - /* update attributes only */ - f2fs_mark_inode_dirty_sync(inode, false); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); From a886cc1d3a501af4c867c3728d4a5594fc80f4c7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:22 +0200 Subject: [PATCH 1122/3220] UPSTREAM: ipv6: fib: Unlink replaced routes from their nodes When a route is deleted, its node pointer is set to NULL to indicate it's no longer linked to its node. Do the same for routes that are replaced. This will later allow us to test if a route is still in the FIB by checking its node pointer instead of its reference count. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller Cherry-pick from: 7483cea79957312e9f8e9cf760a1bc5d6c507113 Bug: 64978549 Change-Id: Ibfa54cf918084138b6b19437e9ef86bfaea5deae --- net/ipv6/ip6_fib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c23e02a7ccb0..bf3824b59597 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -909,6 +909,7 @@ add: fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->rt6i_nsiblings; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; @@ -923,6 +924,7 @@ add: break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; From 642da1dadeb8bdbeab8acd138afbdc041a723ae9 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 25 Sep 2017 08:55:09 -0700 Subject: [PATCH 1123/3220] FROMLIST: binder: fix use-after-free in binder_transaction() (from https://patchwork.kernel.org/patch/9978801/) User-space normally keeps the node alive when creating a transaction since it has a reference to the target. The local strong ref keeps it alive if the sending process dies before the target process processes the transaction. If the source process is malicious or has a reference counting bug, this can fail.
In this case, when we attempt to decrement the node in the failure path, the node has already been freed. This is fixed by taking a tmpref on the node while constructing the transaction. To avoid re-acquiring the node lock and inner proc lock to increment the proc's tmpref, a helper is used that does the ref increments on both the node and proc. Bug: 66899329 Change-Id: Iad40e1e0bccee88234900494fb52a510a37fe8d7 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 95 ++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 28 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 7c9f9dde8397..32a2b2f44691 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2744,6 +2744,48 @@ static bool binder_proc_transaction(struct binder_transaction *t, return true; } +/** + * binder_get_node_refs_for_txn() - Get required refs on node for txn + * @node: struct binder_node for which to get refs + * @proc: returns @node->proc if valid + * @error: if no @proc then returns BR_DEAD_REPLY + * + * User-space normally keeps the node alive when creating a transaction + * since it has a reference to the target. The local strong ref keeps it + * alive if the sending process dies before the target process processes + * the transaction. If the source process is malicious or has a reference + * counting bug, relying on the local strong ref can fail. + * + * Since user-space can cause the local strong ref to go away, we also take + * a tmpref on the node to ensure it survives while we are constructing + * the transaction. We also need a tmpref on the proc while we are + * constructing the transaction, so we take that here as well. + * + * Return: The target_node with refs taken or NULL if no @node->proc is NULL. + * Also sets @proc if valid. 
If the @node->proc is NULL indicating that the + * target proc has died, @error is set to BR_DEAD_REPLY + */ +static struct binder_node *binder_get_node_refs_for_txn( + struct binder_node *node, + struct binder_proc **procp, + uint32_t *error) +{ + struct binder_node *target_node = NULL; + + binder_node_inner_lock(node); + if (node->proc) { + target_node = node; + binder_inc_node_nilocked(node, 1, 0, NULL); + binder_inc_node_tmpref_ilocked(node); + node->proc->tmp_ref++; + *procp = node->proc; + } else + *error = BR_DEAD_REPLY; + binder_node_inner_unlock(node); + + return target_node; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -2846,43 +2888,35 @@ static void binder_transaction(struct binder_proc *proc, ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { - binder_inc_node(ref->node, 1, 0, NULL); - target_node = ref->node; + target_node = binder_get_node_refs_for_txn( + ref->node, &target_proc, + &return_error); + } else { + binder_user_error("%d:%d got transaction to invalid handle\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); - if (target_node == NULL) { - binder_user_error("%d:%d got transaction to invalid handle\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_invalid_target_handle; - } } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; - if (target_node == NULL) { + if (target_node) + target_node = binder_get_node_refs_for_txn( + target_node, &target_proc, + &return_error); + else return_error = BR_DEAD_REPLY; - mutex_unlock(&context->context_mgr_node_lock); - return_error_line = __LINE__; - goto err_no_context_mgr_node; - } - binder_inc_node(target_node, 1, 0, NULL); mutex_unlock(&context->context_mgr_node_lock); } - e->to_node = target_node->debug_id; - binder_node_lock(target_node); - target_proc = target_node->proc; - if (target_proc == NULL) { - binder_node_unlock(target_node); - return_error = BR_DEAD_REPLY; + if (!target_node) { + /* + * return_error is set above + */ + return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } - binder_inner_proc_lock(target_proc); - target_proc->tmp_ref++; - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); + e->to_node = target_node->debug_id; if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -3241,6 +3275,8 @@ static void binder_transaction(struct binder_proc *proc, if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); + if (target_node) + binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry @@ -3260,6 +3296,8 @@ err_bad_parent: err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); + if (target_node) + binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); @@ -3274,13 +3312,14 @@ err_bad_call_stack: err_empty_call_stack: err_dead_binder: err_invalid_target_handle: -err_no_context_mgr_node: if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); - if (target_node) + if (target_node) { binder_dec_node(target_node, 
1, 0); + binder_dec_node_tmpref(target_node); + } binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", From 6b1f845ef3460e721da7aeeecc37e41229285f85 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 09:55:38 -0800 Subject: [PATCH 1124/3220] f2fs: remove wrongly backported code Kconfig and dentry RCU mode fixes. Fixes: c1286ff41c2f610df ("f2fs: backport from 'for-f2fs-4.9'") Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 1 - fs/f2fs/namei.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1852d99df97b..378c221d68a9 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,7 +2,6 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK select CRYPTO - select KEYS select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 468b2dbe6d34..523bf073642e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1014,9 +1014,6 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook u32 max_size = inode->i_sb->s_blocksize; int res; - if (!dentry) - return ERR_PTR(-ECHILD); - res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); From dc45fd9e28345dccdfef354d4ba16e7d406f3301 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 18:24:54 -0800 Subject: [PATCH 1125/3220] f2fs: resolve op and op_flags conflicts commit 70fd76140a6cb63262bd47b68d57b42e889c10ee upstream. This patch backports ("block,fs: use REQ_* flags directly"). Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 +++++--- fs/f2fs/data.c | 54 ++++++++++++++++++++----------------- fs/f2fs/f2fs.h | 24 +++++++++++++++-- fs/f2fs/gc.c | 12 ++++++--- fs/f2fs/inline.c | 3 ++- fs/f2fs/node.c | 12 +++++---- fs/f2fs/segment.c | 9 ++++--- fs/f2fs/trace.c | 7 ++--- include/trace/events/f2fs.h | 19 ++++++++----- 9 files changed, 98 insertions(+), 53 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 640f28576e88..2ed785e5ffbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -64,14 +64,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { @@ -158,13 +159,15 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, + .op = REQ_OP_READ, + .op_flags = sync ?
(REQ_SYNC | REQ_META | REQ_PRIO) : + REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; if (unlikely(type == META_POR)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 283fc9de4762..2eddf1daf995 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -168,15 +168,15 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } -static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, - struct bio *bio, enum page_type type) +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) { - if (!is_read_io(rw)) { + if (!is_read_io(bio_op(bio))) { if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } - submit_bio(rw, bio); + submit_bio(0, bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -186,12 +186,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->rw)) + if (is_read_io(fio->op)) trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - __submit_bio(io->sbi, fio->rw, io->bio, fio->type); + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } @@ -257,10 +259,10 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - if (test_opt(sbi, NOBARRIER)) - io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + io->fio.op = REQ_OP_WRITE; + io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -302,14 +304,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, fio->rw, bio, fio->type); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -318,7 +321,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->rw); + bool is_read = is_read_io(fio->op); struct page *bio_page; io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; @@ -335,7 +338,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.rw != fio->rw) || + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: @@ -463,7 +466,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *get_read_data_page(struct inode *inode, pgoff_t index, - int rw, bool for_write) + int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -473,7 +476,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .encrypted_page = NULL, }; @@ -541,7 +545,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, REQ_SYNC, false); if (IS_ERR(page)) return page; @@ -567,7 +571,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, REQ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -1145,7 +1149,7 @@ got_it: if (bio && (last_block_in_bio != block_nr - 1 || !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { @@ -1154,6 +1158,7 @@ submit_and_realloc: bio = NULL; goto set_error_page; } + bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1168,7 +1173,7 @@ set_error_page: goto next_page; confused: if (bio) { - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -1178,7 +1183,7 @@ next_page: } BUG_ON(pages && !list_empty(pages)); if (bio) - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1296,7 +1301,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1728,14 +1734,14 @@ repeat: err = PTR_ERR(bio); goto fail; } - + bio->bi_rw = READ_SYNC; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; goto fail; } - __submit_bio(sbi, READ_SYNC, bio, DATA); + __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3ef2d93ab936..d0c7decdd3ac 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -112,6 +113,24 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +static inline int wbc_to_write_flags(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return REQ_SYNC; + return 0; +} + /** * wq_has_sleeper - check if there are any waiting processes * @wq: wait queue head @@ -746,14 +765,15 @@ enum page_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ - int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + int op; /* contains REQ_OP_ */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; -#define is_read_io(rw) (((rw) & 1) == READ) +#define is_read_io(rw) (rw == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6390d45c1b68..d3a36e4b442c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = READ_SYNC, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -627,7 +628,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - fio.rw = WRITE_SYNC; + fio.op = REQ_OP_WRITE; + fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -668,7 +670,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE, .page = page, .encrypted_page = NULL, }; @@ -767,7 +770,8 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA, true); + start_bidx + ofs_in_node, REQ_RAHEAD, + true); if (IS_ERR(data_page)) { iput(inode); continue; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b85987703d1e..fb5d7d1f34aa 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -128,7 +128,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) struct f2fs_io_info fio = { .sbi = 
F2FS_I_SB(dn->inode), .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index edacbabb92cf..26a745c544fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1068,14 +1068,15 @@ fail: * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int rw) +static int read_node_page(struct page *page, int op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .page = page, .encrypted_page = NULL, }; @@ -1116,7 +1117,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!apage) return; - err = read_node_page(apage, READA); + err = read_node_page(apage, REQ_RAHEAD); f2fs_put_page(apage, err ? 1 : 0); } @@ -1134,7 +1135,7 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, REQ_SYNC); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1575,7 +1576,8 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a288de069164..70aec4a8de13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -257,7 +257,8 @@ static int __commit_inmem_pages(struct inode *inode, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -407,6 +408,7 @@ static int __submit_flush_wait(struct block_device *bdev) struct bio *bio = f2fs_bio_alloc(0); int ret; + bio->bi_rw = REQ_OP_WRITE; bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); @@ -1544,7 +1546,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -1552,7 +1555,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; set_page_writeback(page); f2fs_submit_page_mbio(&fio); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 562ce0821559..73b4e1d1912a 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -25,11 +25,11 @@ static inline void __print_last_io(void) if (!last_io.len) return; - trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, + last_io.fio.op, last_io.fio.op_flags, last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); @@ -101,7 +101,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) if (last_io.major == major && last_io.minor == minor && last_io.pid == pid && last_io.type == __file_type(inode, pid) && - last_io.fio.rw == fio->rw && + last_io.fio.op == fio->op && + last_io.fio.op_flags == fio->op_flags && 
last_io.fio.new_blkaddr + last_io.len == fio->new_blkaddr) { last_io.len++; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 90d6ad49a9c5..7ad46e8a89e6 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -59,7 +59,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); #define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) -#define show_bio_type(type) show_bio_base(type), show_bio_extra(type) +#define show_bio_type(op, op_flags) \ + show_bio_base((op|op_flags)), show_bio_extra((op|op_flags)) #define show_bio_base(type) \ __print_symbolic(F2FS_BIO_MASK(type), \ @@ -734,7 +735,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(pgoff_t, index) __field(block_t, old_blkaddr) __field(block_t, new_blkaddr) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) ), @@ -744,7 +746,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->index = page->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; ), @@ -754,7 +757,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type)) ); @@ -785,7 +788,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) __field(sector_t, sector) __field(unsigned int, size) @@ -793,7 +797,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; @@ -801,7 +806,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", show_dev(__entry), - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) From 401c465b81a78fb2a531d37a63804a4eb8ee2027 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 16:41:25 -0800 Subject: [PATCH 1126/3220] f2fs: support async discard based on v4.9 commit 275b66b09e85cf0520dc610dd89706952751a473 upstream. This patch is based on commit 275b66b09e85 (f2fs: support async discard). 
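The core bookkeeping of the async-discard scheme in this patch: every issued discard gets a wait-list entry holding a completion, the bio's end-io callback completes the entry, and the checkpoint path later walks the list and waits on each entry before declaring the space reusable. Here is a compact userspace model of that flow, not the kernel implementation; a POSIX semaphore stands in for struct completion, and all names are illustrative.

#include <pthread.h>
#include <semaphore.h>
#include <stdlib.h>
#include <stdio.h>

struct discard_entry {
	sem_t done;                  /* ~ struct completion */
	int error;
	struct discard_entry *next;
};

static struct discard_entry *wait_list;

/* end-io path: record the result and complete the entry */
static void *device_finishes_discard(void *arg)
{
	struct discard_entry *e = arg;

	e->error = 0;                /* pretend the discard succeeded */
	sem_post(&e->done);          /* ~ complete(&be->event) */
	return NULL;
}

/* issue path: queue the entry, kick off the "I/O", return immediately */
static void issue_discard_async(void)
{
	struct discard_entry *e = calloc(1, sizeof(*e));
	pthread_t t;

	sem_init(&e->done, 0, 0);
	e->next = wait_list;
	wait_list = e;
	pthread_create(&t, NULL, device_finishes_discard, e);
	pthread_detach(t);
}

/* checkpoint path: wait for everything issued so far, then reclaim */
static void wait_all_discards(void)
{
	while (wait_list) {
		struct discard_entry *e = wait_list;

		sem_wait(&e->done);  /* ~ wait_for_completion_io() */
		if (e->error)
			fprintf(stderr, "discard failed: %d\n", e->error);
		wait_list = e->next;
		sem_destroy(&e->done);
		free(e);
	}
}

int main(void)
{
	issue_discard_async();
	issue_discard_async();
	wait_all_discards();         /* checkpoint blocks here, not at issue */
	return 0;
}

The design choice is that latency moves from the issuer to the checkpoint: user I/O never waits on a discard, and the checkpoint provides the single synchronization point before blocks are reused.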
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 183 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 181 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..d485bea3d6bb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,6 +1255,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1273,10 +1274,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) + if (err) { release_discard_addrs(sbi); - else + } else { clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0c7decdd3ac..883d3ab388c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -127,7 +127,7 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, static inline int wbc_to_write_flags(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL) - return REQ_SYNC; + return REQ_SYNC | REQ_NOIDLE; return 0; } @@ -2174,6 +2174,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 70aec4a8de13..13bea6e5120e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,6 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *bio_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -622,6 +623,162 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, + struct bio *bio) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + + INIT_LIST_HEAD(&be->list); + be->bio = bio; + init_completion(&be->event); + list_add_tail(&be->list, wait_list); + + return be; +} + +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be, *tmp; + + list_for_each_entry_safe(be, tmp, wait_list, list) { + struct bio *bio = be->bio; + int err; + + wait_for_completion_io(&be->event); + err = be->error; + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + + bio_put(bio); + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); + } +} + +static void f2fs_submit_bio_wait_endio(struct bio *bio) +{ + struct bio_entry *be = (struct bio_entry *)bio->bi_private; + + be->error = bio->bi_error; + complete(&be->event); +} + +/* copied from block/blk-lib.c in 4.10-rc1 */ +static int __blkdev_issue_discard(struct 
block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + unsigned int granularity; + int op = REQ_WRITE | REQ_DISCARD; + int alignment; + sector_t bs_mask; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (!blk_queue_secdiscard(q)) + return -EOPNOTSUPP; + op |= REQ_SECURE; + } + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + while (nr_sects) { + unsigned int req_sects; + sector_t end_sect, tmp; + + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); + + /** + * If splitting a request, and the next starting sector would be + * misaligned, stop the discard at the previous aligned sector. + */ + end_sect = sector + req_sects; + tmp = end_sect; + if (req_sects < nr_sects && + sector_div(tmp, granularity) != alignment) { + end_sect = end_sect - alignment; + sector_div(end_sect, granularity); + end_sect = end_sect * granularity + alignment; + req_sects = end_sect - sector; + } + + if (bio) { + int ret = submit_bio_wait(0, bio); + bio_put(bio); + if (ret) + return ret; + } + bio = f2fs_bio_alloc(0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, op, 0); + + bio->bi_iter.bi_size = req_sects << 9; + nr_sects -= req_sects; + sector = end_sect; + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. 
+ */ + cond_resched(); + } + + *biop = bio; + return 0; +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + struct bio *bio = NULL; + int err; + + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); + if (!err && bio) { + struct bio_entry *be = __add_bio_entry(sbi, bio); + + bio->bi_private = be; + bio->bi_end_io = f2fs_submit_bio_wait_endio; + submit_bio(REQ_SYNC, bio); + } + + return err; +} + #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -655,8 +812,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return blkdev_issue_discard(bdev, sector, nr_sects, - GFP_NOFS, 0); + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: trace_f2fs_issue_reset_zone(sbi->sb, blkstart); @@ -672,15 +828,12 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); - #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_mounted_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -720,8 +873,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, if (len) err = __issue_discard_async(sbi, bdev, start, len); - - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return err; } @@ -822,11 +973,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); + blk_start_plug(&plug); + mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -875,6 +1029,8 @@ skip: SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } + + blk_finish_plug(&plug); } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -2551,6 +2707,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; INIT_LIST_HEAD(&sm_info->discard_list); + INIT_LIST_HEAD(&sm_info->wait_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2694,10 +2851,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; + bio_entry_slab = f2fs_kmem_cache_create("bio_entry", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + goto destroy_discard_entry; + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", 
sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_discard_entry; + goto destroy_bio_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2707,6 +2869,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); +destroy_bio_entry: + kmem_cache_destroy(bio_entry_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2716,6 +2880,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(bio_entry_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From 2ed473dc9155c4e031cefdb75c6edfc3d4a42817 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 13 Dec 2016 17:23:37 +0800 Subject: [PATCH 1127/3220] f2fs: remove unused values in recover_fsync_data commit fed24668482e07421b8e746a4886e7725434050a upstream. This patch removes unused values in recover_fsync_data(). Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 983c35da6bce..4a3c48c24c10 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -552,10 +552,8 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; struct list_head dir_list; - block_t blkaddr; int err; int ret = 0; bool need_writecp = false; @@ -571,8 +569,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); - blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err || list_empty(&inode_list)) From 1b05b5e173183cfdfc8791c34f04e8de2edb42f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Dec 2016 18:54:59 +0800 Subject: [PATCH 1128/3220] f2fs: don't cache nat entry if out of memory commit 5c9e418436f3445d7cc4f3ba2964f231a4b33f17 upstream. If we run out of memory in cache_nat_entry, it's better to avoid looping to allocate memory for the cached nat entry; in a low-memory scenario, this avoids unneeded latency on the node-block read path.
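The allocation policy behind this change is a pattern worth isolating: paths that must not lose an update keep a retrying, no-fail allocator, while the node-block read path makes a single attempt and simply skips caching under memory pressure. A minimal sketch of the two-path helper follows, in userspace terms; malloc() stands in for f2fs_kmem_cache_alloc()/kmem_cache_alloc(), and the radix-tree insert is omitted.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>

struct nat_entry_model {
	long nid;
};

static struct nat_entry_model *grab_nat_entry(long nid, bool no_fail)
{
	struct nat_entry_model *new;

	if (no_fail) {
		/* writer path: keep retrying, as the kernel's
		 * f2fs_kmem_cache_alloc() does, since failure here
		 * would lose a mandatory update */
		do {
			new = malloc(sizeof(*new));
		} while (!new);
	} else {
		/* read path: one attempt; on failure just skip the
		 * cache and take the slower uncached lookup later */
		new = malloc(sizeof(*new));
		if (!new)
			return NULL;
	}
	memset(new, 0, sizeof(*new));
	new->nid = nid;
	return new;
}

int main(void)
{
	struct nat_entry_model *e = grab_nat_entry(42, false);

	printf("cached: %s\n", e ? "yes" : "no (skipped under pressure)");
	free(e);
	return 0;
}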
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 26a745c544fc..b01b01cfc39e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -245,12 +245,24 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, + bool no_fail) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + if (no_fail) { + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + } else { + new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + if (!new) + return NULL; + if (radix_tree_insert(&nm_i->nat_root, nid, new)) { + kmem_cache_free(nat_entry_slab, new); + return NULL; + } + } + memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); nat_reset_flag(new); @@ -267,8 +279,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = __lookup_nat_cache(nm_i, nid); if (!e) { - e = grab_nat_entry(nm_i, nid); - node_info_from_raw_nat(&e->ni, ne); + e = grab_nat_entry(nm_i, nid, false); + if (e) + node_info_from_raw_nat(&e->ni, ne); } else { f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -286,7 +299,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid); + e = grab_nat_entry(nm_i, ni->nid, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -2155,7 +2168,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid); + ne = grab_nat_entry(nm_i, nid, true); node_info_from_raw_nat(&ne->ni, &raw_ne); } From a1c90b43fc83873b86b4637ba7293bea5d1b7eec Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 16 Dec 2016 11:18:15 +0300 Subject: [PATCH 1129/3220] f2fs: remove unneeded condition commit 07fe8d44409f88be8f4a4e8f22b47ee709a22657 upstream. We checked that "inode" is not an error pointer earlier so there is no need to check again here. Signed-off-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 523bf073642e..ca9e2f85eae8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -321,9 +321,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto err_out; } - if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && - (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !fscrypt_has_permitted_context(dir, inode)) { + if (f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { bool nokey = f2fs_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); err = nokey ? -ENOKEY : -EPERM; From 75487d02a75b16fc40c9640459f09604185cd77b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 19 Dec 2016 20:10:48 +0800 Subject: [PATCH 1130/3220] f2fs: fix a problem of using memory after free commit 7855eba4d6102f811b6dd142d6c749f53b591fa3 upstream. 
This patch fixes a use-after-free problem in the function __try_merge_extent_node. Fixes: 0f825ee6e873 ("f2fs: add new interfaces for extent tree") Cc: Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 4db44da7ef69..e02c3d88dc9a 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -352,11 +352,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) - __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + en = next_ex; } From ff9199293b056f202f447e07bb4160581ae48cac Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 20 Dec 2016 11:11:35 +0800 Subject: [PATCH 1131/3220] f2fs: add a case of no need to read a page in write begin commit 746e2403927efbd7c7f2e796314e3cfb3cfabaa4 upstream. If the range we write covers all the valid data in the last page, we do not need to read it. Signed-off-by: Yunlei He [Jaegeuk Kim: nullify the remaining area (fix: xfstests/f2fs/001)] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2eddf1daf995..d7c20880e53e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1723,6 +1723,11 @@ repeat: if (len == PAGE_SIZE || PageUptodate(page)) return 0; + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); @@ -1777,7 +1782,7 @@ static int f2fs_write_end(struct file *file, * let generic_perform_write() try to copy data again through copied=0. */ if (!PageUptodate(page)) { - if (unlikely(copied != PAGE_SIZE)) + if (unlikely(copied != len)) copied = 0; else SetPageUptodate(page); From 574da11960668d46dc8fb2d36050bf1aecf782ea Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 20 Dec 2016 21:57:42 +0800 Subject: [PATCH 1132/3220] f2fs: use rb_entry_safe commit ed0b56209fe79a1309653d4b03f5c3147f580f6b upstream. Use rb_entry_safe() instead of open-coding it. Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index e02c3d88dc9a..6ed6424807b6 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -311,28 +311,24 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, tmp_node = parent; if (parent && fofs > en->ei.fofs) tmp_node = rb_next(parent); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); tmp_node = parent; if (parent && fofs < en->ei.fofs) tmp_node = rb_prev(parent); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); return NULL; lookup_neighbors: if (fofs == en->ei.fofs) { /* lookup prev node for merging backward later */ tmp_node = rb_prev(&en->rb_node); - *prev_ex = tmp_node ?
- rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } if (fofs == en->ei.fofs + en->ei.len - 1) { /* lookup next node for merging frontward later */ tmp_node = rb_next(&en->rb_node); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } return en; } @@ -493,9 +489,8 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!next_en) { struct rb_node *node = rb_next(&en->rb_node); - next_en = node ? - rb_entry(node, struct extent_node, rb_node) - : NULL; + next_en = rb_entry_safe(node, struct extent_node, + rb_node); } if (parts) From c70e14cdaf37a57230a1f6bd671651d4161ce040 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 22 Dec 2016 11:46:24 +0800 Subject: [PATCH 1133/3220] f2fs: fix a missing discard prefree segments commit 650d3c4e56e1e92ee6e004648c9deb243e5963e0 upstream. If userspace issues an fstrim with a range that does not involve prefree segments, f2fs will reuse these segments without discarding them. This patch fixes it. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 13bea6e5120e..f4e41f997ae3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -996,9 +996,13 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (force || !test_opt(sbi, DISCARD)) + if (!test_opt(sbi, DISCARD)) continue; + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); @@ -2343,8 +2347,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason == CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc); + + cpc->trim_start = trim_start; } mutex_unlock(&sit_i->sentry_lock); From d4e5223d818323b401fdefaff076a0e67c44894d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 11:51:32 -0800 Subject: [PATCH 1134/3220] f2fs: reassign new segment for mode=lfs commit 9d52a504db6db9e4e254576130aa867838daff55 upstream. Otherwise we can leave a wrong curseg->next_blkoff behind, resulting in fsck failure. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f4e41f997ae3..e6d3f3d4b028 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1508,9 +1508,6 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; - if (test_opt(sbi, LFS)) - return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; From 8ef4f0ca7b4d0cefe0f0b42af6106e6510f39ae6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 12:13:03 -0800 Subject: [PATCH 1135/3220] f2fs: add submit_bio tracepoint commit 554b5125f5cfca6653461fd52bad24d4ef35ec29 upstream. This patch adds a final submit_bio() tracepoint.
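As an aside (not part of the patch), the renamed prepare_* events and the new submit_* events can be watched through the usual tracefs interface, assuming tracefs is mounted at the standard debugfs location:

 # cd /sys/kernel/debug/tracing
 # echo 1 > events/f2fs/f2fs_prepare_write_bio/enable
 # echo 1 > events/f2fs/f2fs_submit_write_bio/enable
 # cat trace_pipe

Every prepare event should eventually be paired with a submit event for the same sector once __submit_bio() actually issues the bio.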
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 +++++++----- include/trace/events/f2fs.h | 45 ++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d7c20880e53e..ccb6fb142d56 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -176,6 +176,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); submit_bio(0, bio); } @@ -186,13 +190,13 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->op)) - trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); - else - trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + if (is_read_io(fio->op)) + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + else + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7ad46e8a89e6..217691582dd4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -779,12 +779,11 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, TP_CONDITION(page->mapping) ); -DECLARE_EVENT_CLASS(f2fs__submit_bio, +DECLARE_EVENT_CLASS(f2fs__bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_STRUCT__entry( __field(dev_t, dev) @@ -797,9 +796,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->op = fio->op; - __entry->op_flags = fio->op_flags; - __entry->type = fio->type; + __entry->op = bio_op(bio); + __entry->op_flags = bio->bi_rw; + __entry->type = type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; ), @@ -812,22 +811,38 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, __entry->size) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); From b3fcb70064065bd1aa2083d98a5ec711d099a1ee Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Dec 2016 10:12:56 -0800 Subject: [PATCH 1136/3220] f2fs: support IO alignment for DATA and NODE writes commit 0a595ebaaa6b53a2226d3fee2a2fd616ea5ba378 upstream. 
This patch implements IO alignment by filling dummy blocks in DATA and NODE write bios. If we can guarantee, for example, 32KB or 64KB for such IOs, we can eliminate the underlying dummy page problem which the FTL conducts in order to close MLC or TLC partially written pages. Note that, - it requires "-o mode=lfs". - IO size should be a power of 2 and must not exceed BIO_MAX_PAGES (256). - read IO is still 4KB. - do a checkpoint at fsync, if a dummy NODE page was written. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 +++++++++++++++++++++++++++++++++++++++-- fs/f2fs/f2fs.h | 4 ++- fs/f2fs/segment.c | 9 +++++-- fs/f2fs/segment.h | 3 +++ fs/f2fs/super.c | 13 +++++++++- include/linux/f2fs_fs.h | 6 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ccb6fb142d56..6c8465d26bb1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -94,6 +94,17 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); + if (IS_DUMMY_WRITTEN_PAGE(page)) { + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_error)) + f2fs_stop_checkpoint(sbi, true); + continue; + } + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { @@ -172,10 +183,42 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { + unsigned int start; + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); + + if (type != DATA && type != NODE) + goto submit_io; + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + SetPagePrivate(page); + set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } +submit_io: if (is_read_io(bio_op(bio))) trace_f2fs_submit_read_bio(sbi->sb, type, bio); else @@ -320,13 +363,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -void f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_mbio(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; bool is_read = is_read_io(fio->op); struct page *bio_page; + int err = 0; io = is_read ?
&sbi->read_io : &sbi->write_io[btype]; @@ -347,6 +391,12 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { + if ((fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + err = -EAGAIN; + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + goto out_fail; + } io->bio = __bio_alloc(sbi, fio->new_blkaddr, BIO_MAX_PAGES, is_read); io->fio = *fio; @@ -360,9 +410,10 @@ alloc_new: io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); - +out_fail: up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); + return err; } static void __set_data_blkaddr(struct dnode_of_data *dn) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 883d3ab388c1..f9a739ffca0f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -859,6 +859,8 @@ struct f2fs_sb_info { struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + int write_io_size_bits; /* Write IO size bits */ + mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -2241,7 +2243,7 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, struct page *, nid_t, enum page_type, int); void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_io_info *); +int f2fs_submit_page_mbio(struct f2fs_io_info *); struct block_device *f2fs_target_device(struct f2fs_sb_info *, block_t, struct bio *); int f2fs_target_device_index(struct f2fs_sb_info *, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e6d3f3d4b028..a7bb97826445 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1684,15 +1684,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); + int err; if (fio->type == NODE || fio->type == DATA) mutex_lock(&fio->sbi->wio_mutex[fio->type]); - +reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_mbio(fio); + if (err == -EAGAIN) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } if (fio->type == NODE || fio->type == DATA) mutex_unlock(&fio->sbi->wio_mutex[fio->type]); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9d44ce83acb2..08f1455c812c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -186,9 +186,12 @@ struct segment_allocation { * the page is atomically written, and it is in inmem_pages list. 
*/ #define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) struct inmem_pages { struct list_head list; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6d8d011786c..fb9f6c09fa11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1764,6 +1764,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); } + f2fs_msg(sbi->sb, KERN_INFO, + "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); return 0; } @@ -1881,12 +1883,19 @@ try_onemore: if (err) goto free_options; + if (F2FS_IO_SIZE(sbi) > 1) { + sbi->write_io_dummy = + mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); + if (!sbi->write_io_dummy) + goto free_options; + } + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_options; + goto free_io_dummy; } err = get_valid_checkpoint(sbi); @@ -2104,6 +2113,8 @@ free_devices: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); free_options: destroy_percpu_info(sbi); kfree(options); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3e5972ef5019..cf54a312993f 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,12 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ +#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* power of 2 */ +#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) + /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) #define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM) From 7b214391b292efcfe6d55202498766f20875f5b1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 17:09:19 -0800 Subject: [PATCH 1137/3220] f2fs: get io size bit from mount option commit ec91538dccd44329ad83d3aae1aa6a8389b5c75f upstream. This patch adds to set io_size_bits from mount option. Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ fs/f2fs/super.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 753dd4f96afe..d99faced79cb 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,8 @@ data_flush Enable data flushing before checkpoint in order to mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. +io_bits=%u Set the bit size of write IO requests. It should be set + with "mode=lfs". 
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb9f6c09fa11..3b169927408e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -101,6 +101,7 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_mode, + Opt_io_size_bits, Opt_fault_injection, Opt_lazytime, Opt_nolazytime, @@ -133,6 +134,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, @@ -535,6 +537,17 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + f2fs_msg(sb, KERN_WARNING, + "Not support %d, larger than %d", + 1 << arg, BIO_MAX_PAGES); + return -EINVAL; + } + sbi->write_io_size_bits = arg; + break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; @@ -558,6 +571,13 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "Should set mode=lfs with %uKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } return 0; } @@ -918,6 +938,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); return 0; } From 97a43c7059c1323ac34693b1c7539e7a42740ee6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 13:55:09 -0800 Subject: [PATCH 1138/3220] f2fs: show the max number of atomic operations commit 26a28a0c1eb756ba18bfb1f93309c4b4406b9cd9 upstream. This patch adds to show the max number of atomic operations which are conducting concurrently. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 7 +++++++ fs/f2fs/f2fs.h | 17 +++++++++++++++++ fs/f2fs/file.c | 8 ++++++-- fs/f2fs/segment.c | 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fbd5184140d0..29cdf0c1da1d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,6 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -256,6 +258,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); + seq_printf(s, " - Atomic write count: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -414,6 +418,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->max_aw_cnt, 0); + mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f9a739ffca0f..19e054b1c4f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -951,6 +951,8 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t aw_cnt; /* # of atomic writes */ + atomic_t max_aw_cnt; /* max # of atomic writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif @@ -2303,6 +2305,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2374,6 +2377,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_atomic_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_dec_atomic_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -2427,6 +2441,9 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) #define stat_dec_inline_dir(inode) +#define stat_inc_atomic_write(inode) +#define stat_dec_atomic_write(inode) +#define stat_update_max_atomic_write(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5808d5c709a7..d7eacef08797 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1546,6 +1546,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) clear_inode_flag(inode, FI_ATOMIC_FILE); out: + stat_inc_atomic_write(inode); + stat_update_max_atomic_write(inode); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1575,9 +1577,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + stat_dec_atomic_write(inode); + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a7bb97826445..353ec85b3835 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -243,6 +243,7 @@ void drop_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); 
clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); From dde5a6f8fd97e05a437995f121cc2f50353bef9c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 17:31:15 -0800 Subject: [PATCH 1139/3220] f2fs: don't allow encrypted operations without keys commit 363fa4e078cbdc97a172c19d19dc04b41b52ebc8 upstream. This patch fixes the renaming bug on encrypted filenames, which was pointed out by (ext4: don't allow encrypted operations without keys) Cc: Theodore Ts'o Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ca9e2f85eae8..db3079cd665d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -660,6 +660,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; @@ -840,6 +846,12 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && (old_dir != new_dir) && (!fscrypt_has_permitted_context(new_dir, old_inode) || From 66e2310bf98024a658fcdd10d52f827d302ef91e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 3 Jan 2017 17:19:30 -0800 Subject: [PATCH 1140/3220] f2fs: drop exist_data for inline_data when truncated to 0 commit bb95d9ab2a9d4afd03b59a603cccb2c601f68b78 upstream. A test program gets two different SEEK_DATA results for a newly created file and an existing file on an f2fs filesystem.
F2FS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 8192) SEEK_DATA size != 0 (offset = 4096) PNFS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 4096) SEEK_DATA size != 0 (offset = 4096) int main(int argc, char **argv) { char *filename = argv[1]; int offset = 1, i = 0, fd = -1; if (argc < 2) { printf("Usage: %s f2fsfilename\n", argv[0]); return -1; } /* if (!access(filename, F_OK) || errno != ENOENT) { printf("Needs a new file for test, %m\n"); return -1; }*/ fd = open(filename, O_RDWR | O_CREAT, 0777); if (fd < 0) { printf("Create test file %s failed, %m\n", filename); return -1; } for (i = 0; i < 20; i++) { offset = 1 << i; ftruncate(fd, 0); lseek(fd, offset, SEEK_SET); write(fd, "test", 5); /* Get the alloc size by seek data equal zero*/ if (lseek(fd, 0, SEEK_DATA)) { printf("SEEK_DATA size != 0 (offset = %d)\n", offset); break; } } close(fd); return 0; } Reported-and-Tested-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7eacef08797..9da13847cda4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,6 +571,8 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) if (f2fs_has_inline_data(inode)) { if (truncate_inline_inode(ipage, from)) set_page_dirty(ipage); + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); f2fs_put_page(ipage, 1); truncate_page = true; goto out; From 132263ddad34592c1c469f5ed1e192c6df6b9a13 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 14:07:53 -0800 Subject: [PATCH 1141/3220] f2fs: relax async discard commands more commit 4e6a8d9b224f886362ea6e8f6046b541437c944f upstream. This patch relaxes async discard commands to avoid waiting for their end_io during checkpoint. Instead of waiting for them during checkpoint, the wait is done when actually reusing the blocks. Test on an initial partition of an NVMe drive.
# time fstrim /mnt/test Before : 6.158s After : 4.822s Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++----- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/segment.c | 24 +++++++++++++++++++----- fs/f2fs/super.c | 3 +++ 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d485bea3d6bb..2ed785e5ffbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,7 +1255,6 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1274,12 +1273,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) { + if (err) release_discard_addrs(sbi); - } else { + else clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); - } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 19e054b1c4f8..3409392dde9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,8 @@ struct discard_entry { struct bio_entry { struct list_head list; + block_t lstart; + block_t len; struct bio *bio; struct completion event; int error; @@ -2178,7 +2180,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); +void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 353ec85b3835..fa3d4f8db389 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -625,20 +625,23 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, - struct bio *bio) + struct bio *bio, block_t lstart, block_t len) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); INIT_LIST_HEAD(&be->list); be->bio = bio; + be->lstart = lstart; + be->len = len; init_completion(&be->event); list_add_tail(&be->list, wait_list); return be; } -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be, *tmp; @@ -647,7 +650,15 @@ void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) struct bio *bio = be->bio; int err; - wait_for_completion_io(&be->event); + if (!completion_done(&be->event)) { + if ((be->lstart <= blkaddr && + blkaddr < be->lstart + be->len) || + blkaddr == NULL_ADDR) + wait_for_completion_io(&be->event); + else + continue; + } + err = be->error; if (err == -EOPNOTSUPP) err = 0; @@ -756,6 +767,7 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { struct bio *bio = NULL; + block_t lblkstart = blkstart; int err; trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); 
@@ -770,13 +782,13 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio); + struct bio_entry *be = __add_bio_entry(sbi, bio, + lblkstart, blklen); bio->bi_private = be; bio->bi_end_io = f2fs_submit_bio_wait_endio; submit_bio(REQ_SYNC, bio); } - return err; } @@ -1655,6 +1667,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); + /* * __add_sum_entry should be resided under the curseg_mutex * because, this function updates a summary entry in the diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3b169927408e..84d5686c4aa4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -770,6 +770,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* be sure to wait for any on-going discard commands */ + f2fs_wait_discard_bio(sbi, NULL_ADDR); + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); From d051ccbd1bfcd04283be206edbe6e5e9dbfe6cb8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 16:58:54 -0800 Subject: [PATCH 1142/3220] f2fs: avoid needless checkpoint in f2fs_trim_fs commit 0333ad4e4f49e14217256e1db1134a70cf60b2de upstream. f2fs_trim_fs() doesn't need to do a checkpoint if there are only newly allocated data blocks, which don't change critical checkpoint data such as nat and sit entries. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..886b96c12c31 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1249,14 +1249,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { - f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); - f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); - f2fs_bug_on(sbi, prefree_segments(sbi)); - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); - unblock_operations(sbi); - goto out; + if (cpc->reason == CP_DISCARD) { + if (NM_I(sbi)->dirty_nat_cnt == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } } /* From 556f5ba3497207ee447f1a163133278e2a8f8376 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 22:06:15 -0800 Subject: [PATCH 1143/3220] f2fs: return fs_trim if there is no candidate commit 25290fa5591d81767713db304e0d567bf991786f upstream. If there is no candidate for a discard command during f2fs_trim_fs, let's return without a checkpoint.
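For reference, this trim path is reached from userspace through the FITRIM ioctl, which the VFS routes to f2fs_trim_fs(). A minimal sketch of the caller side (the mount point path is illustrative, not from the patch):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* trim the whole filesystem */
		.minlen = 0,
	};
	int fd = open("/mnt/f2fs", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	/* the kernel writes back how many bytes were trimmed */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}

With this patch, a second run over an unchanged range returns quickly because exist_trim_candidates() finds nothing to discard and the checkpoint is skipped.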
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 28 +++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 886b96c12c31..fbf04d4d7964 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1250,6 +1250,11 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* this is the case of multiple fstrims without any changes */ if (cpc->reason == CP_DISCARD) { + if (!exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + if (NM_I(sbi)->dirty_nat_cnt == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3409392dde9c..3eb53e3a8eae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2186,6 +2186,7 @@ void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); +bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void update_meta_page(struct f2fs_sb_info *, void *, block_t); void write_meta_page(struct f2fs_sb_info *, struct page *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa3d4f8db389..12f8d5ab7ccf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -914,7 +914,8 @@ done: SM_I(sbi)->nr_discards += end - start; } -static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; @@ -928,12 +929,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) - return; + return false; if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) - return; + return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ @@ -951,8 +952,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) && (end - start) < cpc->trim_minlen) continue; + if (check_only) + return true; + __add_discard_entry(sbi, cpc, se, start, end); } + return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) @@ -1533,6 +1538,19 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + + mutex_lock(&SIT_I(sbi)->sentry_lock); + for (; trim_start <= cpc->trim_end; trim_start++) + if (add_discard_addrs(sbi, cpc, true)) + break; + mutex_unlock(&SIT_I(sbi)->sentry_lock); + + return trim_start <= cpc->trim_end; +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); @@ -2329,7 +2347,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* add discard candidates */ if (cpc->reason != CP_DISCARD) { cpc->trim_start = segno; - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); } if (to_journal) { @@ -2367,7 +2385,7 @@ out: __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) - 
add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } From 7129702a487ad1a4689bdf41c3657f829472c18c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:49:42 +0800 Subject: [PATCH 1144/3220] f2fs: clean up with list_{first, last}_entry commit 939afa943c5290a3b92f01612a792af17bc98115 upstream. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 4 ++-- fs/f2fs/node.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fbf04d4d7964..45ef3b6bfb04 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -892,7 +892,7 @@ retry: F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { @@ -925,7 +925,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) spin_unlock(&sbi->inode_lock[DIRTY_META]); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, + fi = list_first_entry(head, struct f2fs_inode_info, gdirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6c8465d26bb1..f1001c871cb6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1144,7 +1144,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, prefetchw(&page->flags); if (pages) { - page = list_entry(pages->prev, struct page, lru); + page = list_last_entry(pages, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1262,7 +1262,7 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e7997e240366..9278b21ee073 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -174,7 +174,7 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); From dc8b8cea1e70b6649498a9850745c394e39d290c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:50:26 +0800 Subject: [PATCH 1145/3220] f2fs: introduce FI_ATOMIC_COMMIT commit 5fe457430e554a2f5188f13c1a2e36ad845640c5 upstream. This patch introduces a new flag to indicate that an inode is in the state of committing atomic writes, so that we can keep the atomic write status of the inode during the commit and skip GCing pages of the atomic write inode; that avoids randomly GCed data being mixed into the current transaction, so transaction isolation can be kept.
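For context, the atomic-write sequence that toggles these flags is driven from userspace roughly as below (a hedged sketch; the ioctl numbers are restated from fs/f2fs/f2fs.h of this era since no uapi header exported them at the time):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define F2FS_IOCTL_MAGIC		0xf5
#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)

/* update a file so that either all writes land or none do */
int atomic_update(int fd, const void *buf, size_t len)
{
	if (ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE) < 0)	/* sets FI_ATOMIC_FILE */
		return -1;
	if (write(fd, buf, len) != (ssize_t)len)	/* data is staged in inmem pages */
		return -1;
	/* FI_ATOMIC_COMMIT is held inside commit_inmem_pages() during this call */
	return ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);
}

GC checks f2fs_is_atomic_file() and now skips such inodes entirely, so the staged transaction is never mixed with migrated blocks.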
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/file.c | 11 ++++++----- fs/f2fs/gc.c | 6 ++++++ fs/f2fs/segment.c | 10 +++++++--- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f1001c871cb6..4ac72a3f920a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2017,7 +2017,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (f2fs_is_atomic_file(inode)) { + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { register_inmem_page(inode, page); return 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3eb53e3a8eae..807855d37c63 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1706,6 +1706,7 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -1895,6 +1896,11 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } +static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); +} + static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9da13847cda4..e4e5d76d80b0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1573,14 +1573,15 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); - if (ret) { - set_inode_flag(inode, FI_ATOMIC_FILE); + if (ret) goto err_out; - } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - stat_dec_atomic_write(inode); + if (!ret) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); + } } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d3a36e4b442c..7f0c3e02408c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -569,6 +569,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -661,6 +664,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 12f8d5ab7ccf..6a870677d58a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -242,12 +242,12 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - clear_inode_flag(inode, FI_ATOMIC_FILE); - stat_dec_atomic_write(inode); - mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); } static int __commit_inmem_pages(struct inode *inode, @@ -316,6 +316,8 @@ int commit_inmem_pages(struct 
inode *inode) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + set_inode_flag(inode, FI_ATOMIC_COMMIT); + mutex_lock(&fi->inmem_lock); err = __commit_inmem_pages(inode, &revoke_list); if (err) { @@ -337,6 +339,8 @@ int commit_inmem_pages(struct inode *inode) } mutex_unlock(&fi->inmem_lock); + clear_inode_flag(inode, FI_ATOMIC_COMMIT); + f2fs_unlock_op(sbi); return err; } From dd5804b2146ef7a3c8e4017be426e07ec25a49fa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:51:01 +0800 Subject: [PATCH 1146/3220] f2fs: check in-memory block bitmap commit 355e78913c0d57492076d545b6f44b94fec2bf6b upstream. This patch adds a mirror for valid block bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 32 ++++++++++++++++++++++++++++++-- fs/f2fs/segment.h | 6 ++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6a870677d58a..aae1c2ea7a1d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1101,14 +1101,32 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) + if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) + if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -2432,6 +2450,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map_mir) + return -ENOMEM; +#endif + if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2861,6 +2886,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sentries[start].cur_valid_map_mir); +#endif kfree(sit_i->sentries[start].ckpt_valid_map); kfree(sit_i->sentries[start].discard_map); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 08f1455c812c..9af95194db06 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -164,6 +164,9 @@ struct seg_entry { unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. 
@@ -320,6 +323,9 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } From 5c53448ff2e90f63fbb7bc955fe67e66d5974dc1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:01 +0800 Subject: [PATCH 1147/3220] f2fs: check in-memory nat version bitmap commit 599a09b2c1ac222e6aad0c22515d1ccde7c3b702 upstream. This patch adds a mirror for nat version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.c | 11 +++++++++++ fs/f2fs/node.h | 15 +++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 807855d37c63..d4783d9cf4e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -607,6 +607,9 @@ struct f2fs_nm_info { /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif int bitmap_size; /* bitmap size */ }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b01b01cfc39e..bc67dc323f7e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2366,6 +2366,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_bitmap) return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + return 0; } @@ -2440,6 +2448,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif sbi->nm_info = NULL; kfree(nm_i); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 9278b21ee073..29ff783eb9c3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -186,6 +186,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } @@ -203,6 +209,12 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) (seg_off << sbi->log_blocks_per_seg << 1) + (block_off & (sbi->blocks_per_seg - 1))); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != + f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -228,6 +240,9 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif } static inline nid_t ino_of_node(struct page *node_page) From fb2e2f44afe51eb5b13cb15c7d722296224d6cde Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:34 +0800 Subject: [PATCH 1148/3220] f2fs: check in-memory sit version bitmap commit 
ae27d62e6befd3cac4ffa702e644cc52019642e8 upstream. This patch adds a mirror for sit version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++---- fs/f2fs/segment.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aae1c2ea7a1d..c39bbffb0cac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2421,7 +2421,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap, *dst_bitmap; + char *src_bitmap; unsigned int bitmap_size; /* allocate memory for SIT information */ @@ -2483,17 +2483,22 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); - if (!dst_bitmap) + sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return -ENOMEM; +#endif + /* init SIT information */ sit_i->s_ops = &default_salloc_ops; sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; - sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; @@ -2901,6 +2906,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); +#endif kfree(sit_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9af95194db06..5cb5755c75d9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -209,6 +209,9 @@ struct sit_info { block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ +#endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ @@ -423,6 +426,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } @@ -643,6 +652,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, check_seg_range(sbi, start); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; @@ -668,6 +683,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) From 
eee3f1f5105a3984bb0572750c2107051ffb2d90 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 14:13:03 -0800 Subject: [PATCH 1149/3220] f2fs: clean up flush/discard command namings commit b01a92019cac30398ef75b560d2668b399f4e393 upstream. This patch simply cleans up the names for flush/discard commands. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/f2fs.h | 20 +++++----- fs/f2fs/segment.c | 98 +++++++++++++++++++++++------------------------ 3 files changed, 59 insertions(+), 61 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 29cdf0c1da1d..883f1ea9e0b6 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -194,7 +194,7 @@ get_cache: si->cache_mem += sizeof(struct f2fs_gc_kthread); /* build merge flush thread */ - if (SM_I(sbi)->cmd_control_info) + if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d4783d9cf4e0..167c5f841b5f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -248,13 +248,12 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; -struct bio_entry { - struct list_head list; - block_t lstart; - block_t len; - struct bio *bio; - struct completion event; - int error; +struct discard_cmd { + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + block_t lstart; /* logical start address */ + block_t len; /* length */ + struct bio *bio; /* bio */ }; /* for the list of fsync inodes, used only during recovery */ @@ -701,8 +700,8 @@ struct f2fs_sm_info { unsigned int rec_prefree_segments; /* for small discard management */ - struct list_head discard_list; /* 4KB discard list */ - struct list_head wait_list; /* linked with issued discard bio */ + struct list_head discard_entry_list; /* 4KB discard entry list */ + struct list_head discard_cmd_list; /* discard cmd list */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. 
discards to be issued */ @@ -716,8 +715,7 @@ struct f2fs_sm_info { unsigned int min_fsync_blocks; /* threshold for fsync */ /* for flush command control */ - struct flush_cmd_control *cmd_control_info; - + struct flush_cmd_control *fcc_info; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c39bbffb0cac..289b3facd2d8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,7 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; -static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -439,7 +439,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) @@ -468,7 +468,7 @@ repeat: int f2fs_issue_flush(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), @@ -511,8 +511,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; - if (SM_I(sbi)->cmd_control_info) { - fcc = SM_I(sbi)->cmd_control_info; + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; goto init_thread; } @@ -522,14 +522,14 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); - SM_I(sbi)->cmd_control_info = fcc; + SM_I(sbi)->fcc_info = fcc; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; return err; } @@ -538,7 +538,7 @@ init_thread: void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; @@ -548,7 +548,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } if (free) { kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; } } @@ -628,42 +628,43 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, +static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc; - INIT_LIST_HEAD(&be->list); - be->bio = bio; - be->lstart = lstart; - be->len = len; - init_completion(&be->event); - list_add_tail(&be->list, wait_list); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + INIT_LIST_HEAD(&dc->list); + dc->bio = bio; + dc->lstart = lstart; + dc->len = len; + init_completion(&dc->wait); + list_add_tail(&dc->list, wait_list); - return be; + 
return dc; } /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be, *tmp; + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc, *tmp; - list_for_each_entry_safe(be, tmp, wait_list, list) { - struct bio *bio = be->bio; + list_for_each_entry_safe(dc, tmp, wait_list, list) { + struct bio *bio = dc->bio; int err; - if (!completion_done(&be->event)) { - if ((be->lstart <= blkaddr && - blkaddr < be->lstart + be->len) || + if (!completion_done(&dc->wait)) { + if ((dc->lstart <= blkaddr && + blkaddr < dc->lstart + dc->len) || blkaddr == NULL_ADDR) - wait_for_completion_io(&be->event); + wait_for_completion_io(&dc->wait); else continue; } - err = be->error; + err = bio->bi_error; if (err == -EOPNOTSUPP) err = 0; @@ -672,17 +673,16 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) "Issue discard failed, ret: %d", err); bio_put(bio); - list_del(&be->list); - kmem_cache_free(bio_entry_slab, be); + list_del(&dc->list); + kmem_cache_free(discard_cmd_slab, dc); } } -static void f2fs_submit_bio_wait_endio(struct bio *bio) +static void f2fs_submit_discard_endio(struct bio *bio) { - struct bio_entry *be = (struct bio_entry *)bio->bi_private; + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - be->error = bio->bi_error; - complete(&be->event); + complete(&dc->wait); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -786,11 +786,11 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio, + struct discard_cmd *dc = __add_discard_cmd(sbi, bio, lblkstart, blklen); - bio->bi_private = be; - bio->bi_end_io = f2fs_submit_bio_wait_endio; + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); } return err; @@ -897,7 +897,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) { - struct list_head *head = &SM_I(sbi)->discard_list; + struct list_head *head = &SM_I(sbi)->discard_entry_list; struct discard_entry *new, *last; if (!list_empty(head)) { @@ -966,7 +966,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,7 +992,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct blk_plug plug; @@ -2783,8 +2783,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - INIT_LIST_HEAD(&sm_info->discard_list); - INIT_LIST_HEAD(&sm_info->wait_list); + INIT_LIST_HEAD(&sm_info->discard_entry_list); + INIT_LIST_HEAD(&sm_info->discard_cmd_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2934,15 +2934,15 @@ int __init create_segment_manager_caches(void) if 
(!discard_entry_slab) goto fail; - bio_entry_slab = f2fs_kmem_cache_create("bio_entry", - sizeof(struct bio_entry)); - if (!bio_entry_slab) + discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_bio_entry; + goto destroy_discard_cmd; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2952,8 +2952,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destroy_bio_entry: - kmem_cache_destroy(bio_entry_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2963,7 +2963,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); - kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From d78a12988cc5d5080a2c1d03a9308b648c45a8d3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 10:21:15 -0800 Subject: [PATCH 1150/3220] f2fs: reorganize stat information commit d4adb30f25f5f2aa9b205891e395251d2a9098be upstream. This patch modifies stat information more clearly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 883f1ea9e0b6..cd338ca24941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -258,8 +258,6 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); - seq_printf(s, " - Atomic write count: %4d (Max. %4d)\n", - si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -318,8 +316,10 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", - si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", + si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", From 69fbcb521aae48b51b8b0b673390ed05264bdd60 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 27 Sep 2017 17:18:48 -0700 Subject: [PATCH 1151/3220] ANDROID: add script to fetch android kernel config fragments The Android kernel config fragments now live in a separate repository. To prevent others from having to search for this location, add a script to fetch and unpack the fragments. Update .gitignore to include these fragments. 
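[Editorial aside on the discard_cmd rework in patch 1149: the structure is a standard completion handshake. The issuer queues a command with an initialized completion, the bio end_io callback completes it, and f2fs_wait_discard_bio() blocks only on commands whose range covers the block of interest. A rough userspace analogue with pthreads, illustrative names only, not the kernel API:]

    #include <pthread.h>
    #include <stdbool.h>

    /* userspace stand-in for the kernel's struct completion;
     * init with pthread_mutex_init()/pthread_cond_init(), done = false */
    struct completion {
            pthread_mutex_t lock;
            pthread_cond_t cond;
            bool done;
    };

    static void complete(struct completion *c)
    {
            pthread_mutex_lock(&c->lock);
            c->done = true;                 /* end_io side */
            pthread_cond_broadcast(&c->cond);
            pthread_mutex_unlock(&c->lock);
    }

    static void wait_for_completion(struct completion *c)
    {
            pthread_mutex_lock(&c->lock);
            while (!c->done)                /* waiter side */
                    pthread_cond_wait(&c->cond, &c->lock);
            pthread_mutex_unlock(&c->lock);
    }

    struct discard_cmd_sketch {
            struct completion wait;         /* signalled by the end_io callback */
            unsigned long lstart, len;      /* logical range being discarded */
    };

    /* block only if the in-flight discard covers @blkaddr */
    static void wait_discard(struct discard_cmd_sketch *dc, unsigned long blkaddr)
    {
            if (blkaddr >= dc->lstart && blkaddr < dc->lstart + dc->len)
                    wait_for_completion(&dc->wait);
    }
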
Change-Id: If2d4a59b86e4573b0a9b3190025dfe4191870b46 Signed-off-by: Steve Muckle --- .gitignore | 3 +++ android/configs/android-fetch-configs.sh | 4 ++++ 2 files changed, 7 insertions(+) create mode 100755 android/configs/android-fetch-configs.sh diff --git a/.gitignore b/.gitignore index fd3a35592543..fa3e5f1d0808 100644 --- a/.gitignore +++ b/.gitignore @@ -112,3 +112,6 @@ all.config # Kdevelop4 *.kdev4 + +# fetched Android config fragments +android/configs/android-*.cfg diff --git a/android/configs/android-fetch-configs.sh b/android/configs/android-fetch-configs.sh new file mode 100755 index 000000000000..9915c1356ed3 --- /dev/null +++ b/android/configs/android-fetch-configs.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +curl https://android.googlesource.com/kernel/configs/+archive/master/android-4.4.tar.gz | tar xzv + From b0fa18e1caa5390619cfb6878b1b63879908ed10 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 3 May 2017 14:30:48 +0100 Subject: [PATCH 1152/3220] UPSTREAM: cpufreq: schedutil: use now as reference when aggregating shared policy requests Currently, sugov_next_freq_shared() uses last_freq_update_time as a reference to decide when to start considering CPU contributions as stale. However, since last_freq_update_time is set by the last CPU that issued a frequency transition, this might cause problems in certain cases. In practice, the detection of stale utilization values fails whenever the CPU with such values was the last to update the policy. For example (and please note again that the SCHED_CPUFREQ_RT flag is not the problem here, but only the detection of after how much time that flag has to be considered stale), suppose a policy with 2 CPUs:

    CPU0                          | CPU1
                                  |
                                  | RT task scheduled
                                  | SCHED_CPUFREQ_RT is set
                                  | CPU1->last_update = now
                                  | freq transition to max
                                  | last_freq_update_time = now
                                  |
          more than TICK_NSEC nsecs
                                  |
    a small CFS wakes up          |
    CPU0->last_update = now1      |
    delta_ns(CPU0) < TICK_NSEC*   |
    CPU0's util is considered     |
    delta_ns(CPU1) =              |
      last_freq_update_time -     |
      CPU1->last_update = 0       |
      < TICK_NSEC                 |
    CPU1 is still considered      |
    CPU1->SCHED_CPUFREQ_RT is set |
    we stay at max (until CPU1    |
    exits from idle)              |

    * delta_ns is actually negative as now1 > last_freq_update_time

While last_freq_update_time is a sensible reference for rate limiting, it doesn't seem to be useful for working around stale CPU states. Fix the problem by always considering now (time) as the reference for deciding when CPUs have stale contributions. Signed-off-by: Juri Lelli Acked-by: Vincent Guittot Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki (cherry picked from commit d86ab9cff8b936aadde444d0e263a8db5ff0349b) --- kernel/sched/cpufreq_schedutil.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index b90f7434e13b..28977799017b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -322,11 +322,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned long util = 0, max = 1; unsigned int j; @@ -342,7 +341,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) * enough, don't take the CPU into account as it probably is * idle now (and clear iowait_boost for it). */ - delta_ns = last_freq_update_time - j_sg_cpu->last_update; + delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; @@ -387,7 +386,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_DL) next_f = sg_policy->policy->cpuinfo.max_freq; else - next_f = sugov_next_freq_shared(sg_cpu); + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } From 13f002354db13a029aef84aa1a8bfa51bd6b8d56 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 18:16:29 -0800 Subject: [PATCH 1153/3220] f2fs: catch up to v4.14-rc1 This is cherry-picked from upstrea-f2fs-stable-linux-4.4.y. Changes include: commit c7fd9e2b4a6876 ("f2fs: hurry up to issue discard after io interruption") commit 603dde39653d6d ("f2fs: fix to show correct discard_granularity in sysfs") ... 
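[Editorial aside on the schedutil fix in patch 1152 above: the change amounts to swapping the reference value in one comparison. A compact sketch of the before/after staleness test, in plain C with an illustrative TICK_NSEC value:]

    #include <stdint.h>
    #include <stdbool.h>

    #define TICK_NSEC_SKETCH 4000000LL      /* ~4 ms; illustrative only */

    /* before: the reference only advances on frequency transitions */
    static bool stale_before(uint64_t last_update, uint64_t last_freq_update_time)
    {
            /*
             * A CPU whose last_update coincided with the last transition
             * yields delta_ns == 0 forever, so it never looks stale.
             */
            int64_t delta_ns = (int64_t)(last_freq_update_time - last_update);

            return delta_ns > TICK_NSEC_SKETCH;
    }

    /* after: the reference is the current time, which always moves forward */
    static bool stale_after(uint64_t last_update, uint64_t now)
    {
            int64_t delta_ns = (int64_t)(now - last_update);

            return delta_ns > TICK_NSEC_SKETCH;
    }
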
commit 565f0225f95f15 ("f2fs: factor out discard command info into discard_cmd_control") commit c4cc29d19eaf01 ("f2fs: remove batched discard in f2fs_trim_fs") Change-Id: Icd8a85ac0c19a8aa25cd2591a12b4e9b85bdf1c5 Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 53 + Documentation/filesystems/f2fs.txt | 22 + fs/crypto/Makefile | 1 + fs/crypto/bio.c | 143 +++ fs/crypto/crypto.c | 278 ++--- fs/crypto/fname.c | 111 +- fs/crypto/fscrypt_private.h | 107 ++ fs/crypto/keyinfo.c | 270 +++-- fs/crypto/policy.c | 243 ++-- fs/f2fs/Makefile | 2 +- fs/f2fs/acl.c | 9 +- fs/f2fs/checkpoint.c | 217 +++- fs/f2fs/crypto_key.c | 240 ---- fs/f2fs/crypto_policy.c | 248 ----- fs/f2fs/data.c | 638 +++++++---- fs/f2fs/debug.c | 67 +- fs/f2fs/dir.c | 106 +- fs/f2fs/extent_cache.c | 368 +++--- fs/f2fs/f2fs.h | 1356 +++++++++++++++-------- fs/f2fs/f2fs_crypto.h | 150 --- fs/f2fs/file.c | 473 +++++--- fs/f2fs/gc.c | 265 +++-- fs/f2fs/gc.h | 27 +- fs/f2fs/hash.c | 7 +- fs/f2fs/inline.c | 174 +-- fs/f2fs/inode.c | 168 ++- fs/f2fs/namei.c | 182 ++- fs/f2fs/node.c | 723 ++++++++---- fs/f2fs/node.h | 65 +- fs/f2fs/recovery.c | 102 +- fs/f2fs/segment.c | 1292 ++++++++++++++++----- fs/f2fs/segment.h | 197 ++-- fs/f2fs/super.c | 1255 ++++++++++++++------- fs/f2fs/sysfs.c | 556 ++++++++++ fs/f2fs/trace.c | 4 +- fs/f2fs/xattr.c | 178 ++- fs/f2fs/xattr.h | 11 +- include/linux/f2fs_fs.h | 43 +- include/linux/fscrypt_common.h | 138 +++ include/linux/fscrypt_notsupp.h | 177 +++ include/linux/fscrypt_supp.h | 145 +++ include/linux/fscrypto.h | 12 +- include/trace/events/f2fs.h | 213 +++- include/uapi/linux/fs.h | 30 + mm/util.c | 1 + 45 files changed, 7588 insertions(+), 3479 deletions(-) create mode 100644 fs/crypto/bio.c create mode 100644 fs/crypto/fscrypt_private.h delete mode 100644 fs/f2fs/crypto_key.c delete mode 100644 fs/f2fs/crypto_policy.c delete mode 100644 fs/f2fs/f2fs_crypto.h create mode 100644 fs/f2fs/sysfs.c create mode 100644 include/linux/fscrypt_common.h create mode 100644 include/linux/fscrypt_notsupp.h create mode 100644 include/linux/fscrypt_supp.h diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 0345f2d1c727..500c60403653 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" Description: Controls the issue rate of small discard commands. +What: /sys/fs/f2fs//discard_granularity +Date: July 2017 +Contact: "Chao Yu" +Description: + Controls discard granularity of inner discard thread, inner thread + will not issue discards with size that is smaller than granularity. + The unit size is one block, now only support configuring in range + of [1, 512]. + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" @@ -92,3 +101,47 @@ Date: October 2015 Contact: "Chao Yu" Description: Controls the count of nid pages to be readaheaded. + +What: /sys/fs/f2fs//dirty_nats_ratio +Date: January 2016 +Contact: "Chao Yu" +Description: + Controls dirty nat entries ratio threshold, if current + ratio exceeds configured threshold, checkpoint will + be triggered for flushing dirty nat entries. + +What: /sys/fs/f2fs//lifetime_write_kbytes +Date: January 2016 +Contact: "Shuoran Liu" +Description: + Shows total written kbytes issued to disk. + +What: /sys/fs/f2fs//inject_rate +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection rate. 
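[Editorial aside: the two injection knobs above work as a pair. inject_type is a bitmask selecting which fault classes may fire, and inject_rate sets how often an eligible call actually fails. A plausible sketch of such a counting scheme; the names are simplified and the real logic, which sits behind CONFIG_F2FS_FAULT_INJECTION, may differ in detail:]

    #include <stdbool.h>

    struct fault_info_sketch {
            unsigned int inject_rate;       /* fire on every Nth eligible call */
            unsigned int inject_type;       /* bitmask of enabled fault classes */
            unsigned int calls;             /* eligible calls since last injection */
    };

    static bool time_to_inject_sketch(struct fault_info_sketch *fi,
                                      unsigned int type)
    {
            if (!fi->inject_rate || !(fi->inject_type & (1u << type)))
                    return false;           /* class disabled or rate unset */

            if (++fi->calls < fi->inject_rate)
                    return false;           /* not this call's turn */

            fi->calls = 0;
            return true;                    /* caller now simulates the failure */
    }
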
+ +What: /sys/fs/f2fs//inject_type +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection type. + +What: /sys/fs/f2fs//reserved_blocks +Date: June 2017 +Contact: "Chao Yu" +Description: + Controls current reserved blocks in system. + +What: /sys/fs/f2fs//gc_urgent +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Do background GC agressively + +What: /sys/fs/f2fs//gc_urgent_sleep_time +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Controls sleep time of GC urgent mode diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index d99faced79cb..6cf9ad12c57f 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -125,6 +125,7 @@ active_logs=%u Support configuring the number of active logs. In the disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. +noinline_xattr Disable the inline xattrs feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. inline_dentry Enable the inline dir feature: data in new created @@ -159,6 +160,18 @@ mode=%s Control block allocation mode which supports "adaptive" writes towards main area. io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". +usrquota Enable plain user disk quota accounting. +grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting. +usrjquota= Appoint specified file and type during mount, so that quota +grpjquota= information can be properly updated during recovery flow, +prjjquota= : must be in root directory; +jqfmt= : [vfsold,vfsv0,vfsv1]. +offusrjquota Turn off user journelled quota. +offgrpjquota Turn off group journelled quota. +offprjjquota Turn off project journelled quota. +quota Enable plain user disk quota accounting. +noquota Disable all plain disk quota option. ================================================================================ DEBUGFS ENTRIES @@ -204,6 +217,15 @@ Files in /sys/fs/f2fs/ gc_idle = 1 will select the Cost Benefit approach & setting gc_idle = 2 will select the greedy approach. + gc_urgent This parameter controls triggering background GCs + urgently or not. Setting gc_urgent = 0 [default] + makes back to default behavior, while if it is set + to 1, background thread starts to do GC by given + gc_urgent_sleep_time interval. + + gc_urgent_sleep_time This parameter controls sleep time for gc_urgent. + 500 ms is set by default. See above gc_urgent. + reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree segments is larger than the number of segments diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index f17684c48739..9f6607f17b53 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c new file mode 100644 index 000000000000..a91ed46fe503 --- /dev/null +++ b/fs/crypto/bio.c @@ -0,0 +1,143 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. 
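[Editorial aside: the gc_urgent/gc_urgent_sleep_time pair documented above amounts to a fixed-cadence override of the background GC's adaptive interval. Roughly, as a sketch rather than the exact f2fs gc-thread code:]

    #include <stdbool.h>

    /* sketch of the sleep-time decision in a background-GC loop */
    static unsigned int next_gc_sleep_ms(bool gc_urgent,
                                         unsigned int urgent_sleep_ms,
                                         unsigned int adaptive_ms)
    {
            if (gc_urgent)
                    return urgent_sleep_ms; /* fixed cadence; 500 ms by default */

            return adaptive_ms;             /* normal mode: adapts to recent idleness */
    }
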
+ * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include "fscrypt_private.h" + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + PAGE_SIZE, 0, GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! 
*/ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(0, bio); + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2d40ab9edc9f..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -24,10 +24,10 @@ #include #include #include -#include #include #include -#include +#include +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -44,7 +44,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -static struct workqueue_struct *fscrypt_read_workqueue; +struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -141,20 +141,15 @@ static void page_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -typedef enum { - FS_DECRYPT = 0, - FS_ENCRYPT, -} fscrypt_direction_t; - -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page, - gfp_t gfp_flags) +int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -162,6 +157,18 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + + BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -174,15 +181,11 @@ static int do_page_crypto(struct inode *inode, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); - 
memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -202,53 +205,86 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags) { ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. 
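[Editorial aside: in caller terms, the two encryption modes described in the kerneldoc above look like this. A sketch using only names from the patch, with error paths trimmed and the post-I/O cleanup shown inline for brevity:]

    /* sketch: a filesystem write path using the new fscrypt_encrypt_page() */
    static int write_encrypted_page(struct inode *inode, struct page *page)
    {
            struct page *out;

            out = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
                                       page->index, GFP_NOFS);
            if (IS_ERR(out))
                    return PTR_ERR(out);

            /* ... submit @out for I/O; it is @page itself under
             * FS_CFLG_OWN_PAGES, or a bounce page holding the ciphertext
             * otherwise ... */

            if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
                    fscrypt_restore_control_page(out); /* frees bounce page + ctx */
            return 0;
    }
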
*/ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page, + ciphertext_page, len, offs, + gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, len, offs, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +301,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,75 +315,17 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. 
*/ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, + len, offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) -{ - struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! */ - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(WRITE, bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; - pblk++; - } - err = 0; -errout: - fscrypt_release_ctx(ctx); - return err; -} -EXPORT_SYMBOL(fscrypt_zeroout_range); - /* * Validate dentries for encrypted directories to make sure we aren't * potentially caching stale data after a key has been added or @@ -351,7 +334,6 @@ EXPORT_SYMBOL(fscrypt_zeroout_range); static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; - struct fscrypt_info *ci; int dir_has_key, cached_with_key; if (flags & LOOKUP_RCU) @@ -363,18 +345,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - ci = d_inode(dir)->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); - dir_has_key = (ci != NULL); + dir_has_key = (d_inode(dir)->i_crypt_info != NULL); dput(dir); /* @@ -399,63 +374,6 @@ const struct dentry_operations fscrypt_d_ops = { }; EXPORT_SYMBOL(fscrypt_d_ops); -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. 
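[Editorial aside on the d_revalidate change above: the rule can be stated in one comparison. A dentry is kept only while its cached form, plaintext or ciphertext, matches the current key availability of its directory. As a simplified sketch; the real fscrypt_d_revalidate() also bails out of RCU-mode lookups:]

    /* sketch: core of the encrypted-dentry revalidation decision */
    static int name_form_still_valid(int cached_with_key, int dir_has_key)
    {
            /*
             * Cached without the key means a ciphertext name; once the key
             * appears it must be looked up again as plaintext. And vice versa.
             */
            return cached_with_key == dir_has_key;
    }
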
- */ -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else { - SetPageUptodate(page); - } - unlock_page(page); - } - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); -} -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); - -void fscrypt_pullback_bio_page(struct page **page, bool restore) -{ - struct fscrypt_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. */ - bounce_page = *page; - ctx = (struct fscrypt_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - if (restore) - fscrypt_restore_control_page(bounce_page); -} -EXPORT_SYMBOL(fscrypt_pullback_bio_page); - void fscrypt_restore_control_page(struct page *page) { struct fscrypt_ctx *ctx; @@ -481,17 +399,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -520,7 +443,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. @@ -562,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include #include -#include +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -209,7 +211,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,14 +229,17 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
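[Editorial aside: the buffer sizing above follows from BASE64_CHARS(nbytes) = ceil(4 * nbytes / 3). Plugging in the two name forms reproduces the constants the removed code hard-coded (33 and 43), assuming FSCRYPT_FNAME_DIGEST_SIZE is 16 and the maximum undigested size is 32 bytes, consistent with the diff:]

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
    #define BASE64_CHARS(nbytes)    DIV_ROUND_UP((nbytes) * 4, 3)

    int main(void)
    {
            /* struct fscrypt_digested_name: u32 hash + u32 minor_hash + u8 digest[16] */
            unsigned int digested_bytes = 4 + 4 + 16;

            printf("%u\n", BASE64_CHARS(32));                  /* 43: longest undigested name */
            printf("%u\n", 1 + BASE64_CHARS(digested_bytes));  /* 33: '_' + encoded struct */
            return 0;
    }
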
*/ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. + * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -332,14 +345,39 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. 
+ * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -350,7 +388,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_encryption_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; @@ -367,31 +405,43 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; @@ -403,12 +453,3 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..79d79755d79b --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,107 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
+ */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include +#include + +/* Encryption parameters */ +#define FS_IV_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. + */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } + +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +/* crypto.c */ +extern int fscrypt_initialize(unsigned int cop_flags); +extern struct workqueue_struct *fscrypt_read_workqueue; +extern int fscrypt_do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, + struct page *src_page, + struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags); + +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..66e0728e9bbe 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,12 @@ #include #include -#include +#include +#include +#include +#include "fscrypt_private.h" + +static struct crypto_shash *essiv_hash_tfm; static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. 
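[Editorial aside: concretely, the derivation below is AES-128-ECB with the 16-byte per-inode nonce as the AES key, encrypting the master key, i.e. derived = AES-ECB(nonce, master_key), later truncated to the mode's keysize. A standalone illustration with OpenSSL, which is an assumption made for demonstration only; the kernel uses its own crypto_skcipher API:]

    #include <openssl/evp.h>

    /* derived = AES-128-ECB(key = nonce, data = master); mlen multiple of 16 */
    static int derive_key_aes_sketch(const unsigned char nonce[16],
                                     const unsigned char *master, int mlen,
                                     unsigned char *derived)
    {
            EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
            int outl = 0, tmp = 0, ok;

            if (!ctx)
                    return -1;
            ok = EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), NULL, nonce, NULL) &&
                 EVP_CIPHER_CTX_set_padding(ctx, 0) &&  /* no padding: raw blocks */
                 EVP_EncryptUpdate(ctx, derived, &outl, master, mlen) &&
                 EVP_EncryptFinal_ex(ctx, derived + outl, &tmp);
            EVP_CIPHER_CTX_free(ctx);
            return ok ? 0 : -1;
    }
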
*/ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,28 +82,25 @@ out: static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - u8 *prefix, int prefix_size) + const char *prefix, int min_keysize) { - u8 *full_key_descriptor; + char *description; struct key *keyring_key; struct fscrypt_key *master_key; const struct user_key_payload *ukp; - int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; int res; - full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); - if (!full_key_descriptor) + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + if (!description) return -ENOMEM; - memcpy(full_key_descriptor, prefix, prefix_size); - sprintf(full_key_descriptor + prefix_size, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); - full_key_descriptor[full_key_len - 1] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - kfree(full_key_descriptor); + keyring_key = request_key(&key_type_logon, description, NULL); + kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); + down_read(&keyring_key->sem); if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING @@ -106,66 +108,68 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; - up_read(&keyring_key->sem); goto out; } master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; - up_read(&keyring_key->sem); goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; - - crypt_info->ci_keyring_key = keyring_key; - return 0; + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: + up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + 
FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; + } + if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; - } - - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + return 0; } static void put_crypt_info(struct fscrypt_info *ci) @@ -173,12 +177,78 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. 
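[Editorial aside: in formula form, the ESSIV scheme set up here is salt = SHA-256(contents_key), then IV = AES-256-Encrypt(salt, le64(lblk_num) zero-padded to 16 bytes). Because the salt is a 32-byte SHA-256 digest, the IV generator is AES-256 even when contents use AES-128-CBC, exactly as the comment above notes. A standalone sketch with OpenSSL's low-level primitives, for illustration only; it assumes a little-endian host:]

    #include <openssl/sha.h>
    #include <openssl/aes.h>
    #include <stdint.h>
    #include <string.h>

    /* sketch: ESSIV IV generation as wired up by init_essiv_generator() */
    static void essiv_iv_sketch(const unsigned char *contents_key, int keysize,
                                uint64_t lblk_num, unsigned char iv[16])
    {
            unsigned char salt[SHA256_DIGEST_LENGTH];   /* 32 bytes -> AES-256 */
            unsigned char block[16] = { 0 };
            AES_KEY essiv;

            SHA256(contents_key, keysize, salt);        /* salt = SHA-256(key) */
            AES_set_encrypt_key(salt, 256, &essiv);

            memcpy(block, &lblk_num, sizeof(lblk_num)); /* le64 blocknr, zero padded */
            AES_encrypt(block, iv, &essiv);             /* IV = E_salt(blocknr) */
    }
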
+ */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + +int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,30 +258,24 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + if (inode->i_crypt_info) + return 0; + + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; -retry: - crypt_info = ACCESS_ONCE(inode->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - fscrypt_put_encryption_info(inode, crypt_info); - goto retry; - } - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode)) + if (!fscrypt_dummy_context_enabled(inode) || + inode->i_sb->s_cop->is_encrypted(inode)) return res; + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); } else if (res != sizeof(ctx)) { return -EINVAL; } @@ -230,7 +294,7 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -247,20 +311,12 @@ retry: if (!raw_key) goto out; - if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); - goto got_key; - } - - res = validate_user_key(crypt_info, &ctx, raw_key, - FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { - u8 *prefix = NULL; - int prefix_size, res2; - - prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); - res2 = validate_user_key(crypt_info, &ctx, raw_key, - prefix, prefix_size); + int res2 = validate_user_key(crypt_info, &ctx, raw_key, + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -269,30 +325,35 @@ retry: } else if (res) { goto out; } -got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM;
-		printk(KERN_DEBUG
-		       "%s: error %d (inode %u) allocating crypto tfm\n",
-		       __func__, res, (unsigned) inode->i_ino);
+		pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n",
+			 __func__, res, inode->i_ino);
 		goto out;
 	}
 	crypt_info->ci_ctfm = ctfm;
 	crypto_skcipher_clear_flags(ctfm, ~0);
 	crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	/*
+	 * if the provided key is longer than keysize, we use the first
+	 * keysize bytes of the derived key only
+	 */
 	res = crypto_skcipher_setkey(ctfm, raw_key, keysize);
 	if (res)
 		goto out;
 
-	kzfree(raw_key);
-	raw_key = NULL;
-	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
-		put_crypt_info(crypt_info);
-		goto retry;
+	if (S_ISREG(inode->i_mode) &&
+	    crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
+		res = init_essiv_generator(crypt_info, raw_key, keysize);
+		if (res) {
+			pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n",
+				 __func__, res, inode->i_ino);
+			goto out;
+		}
 	}
-	return 0;
-
+	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL)
+		crypt_info = NULL;
 out:
 	if (res == -ENOKEY)
 		res = 0;
@@ -300,6 +361,7 @@ out:
 	kzfree(raw_key);
 	return res;
 }
+EXPORT_SYMBOL(fscrypt_get_encryption_info);
 
 void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
 {
@@ -317,17 +379,3 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
 	put_crypt_info(ci);
 }
 EXPORT_SYMBOL(fscrypt_put_encryption_info);
-
-int fscrypt_get_encryption_info(struct inode *inode)
-{
-	struct fscrypt_info *ci = inode->i_crypt_info;
-
-	if (!ci ||
-	    (ci->ci_keyring_key &&
-	     (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
-					   (1 << KEY_FLAG_REVOKED) |
-					   (1 << KEY_FLAG_DEAD)))))
-		return get_crypt_info(inode);
-	return 0;
-}
-EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6865663aac69..9914d51dff86 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,76 +10,37 @@
 #include <linux/random.h>
 #include <linux/string.h>
-#include <linux/fscrypto.h>
 #include <linux/mount.h>
-
-static int inode_has_encryption_context(struct inode *inode)
-{
-	if (!inode->i_sb->s_cop->get_context)
-		return 0;
-	return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
-}
+#include "fscrypt_private.h"
 
 /*
- * check whether the policy is consistent with the encryption context
- * for the inode
+ * check whether an encryption policy is consistent with an encryption context
 */
-static int is_encryption_context_consistent_with_policy(struct inode *inode,
+static bool is_encryption_context_consistent_with_policy(
+				const struct fscrypt_context *ctx,
 				const struct fscrypt_policy *policy)
 {
-	struct fscrypt_context ctx;
-	int res;
-
-	if (!inode->i_sb->s_cop->get_context)
-		return 0;
-
-	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
-	if (res != sizeof(ctx))
-		return 0;
-
-	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
-			FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-		(ctx.flags == policy->flags) &&
-		(ctx.contents_encryption_mode ==
-		 policy->contents_encryption_mode) &&
-		(ctx.filenames_encryption_mode ==
-		 policy->filenames_encryption_mode));
+	return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
+		      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+		(ctx->flags == policy->flags) &&
+		(ctx->contents_encryption_mode ==
+		 policy->contents_encryption_mode) &&
+		(ctx->filenames_encryption_mode ==
+		 policy->filenames_encryption_mode);
 }
 
 static int create_encryption_context_from_policy(struct inode *inode,
 				const struct fscrypt_policy *policy)
 {
 	struct
fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - - if (inode->i_sb->s_cop->prepare_context) { - res = inode->i_sb->s_cop->prepare_context(inode); - if (res) - return res; - } ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; - } - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } if (policy->flags & ~FS_POLICY_FLAGS_VALID) return -EINVAL; @@ -93,16 +54,20 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + struct fscrypt_context ctx; + + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -111,22 +76,23 @@ int fscrypt_process_policy(struct file *filp, inode_lock(inode); - if (!inode_has_encryption_context(inode)) { + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); - } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { - printk(KERN_WARNING - "%s: Policy inconsistent with encryption context\n", - __func__); - ret = -EINVAL; + &policy); + } else if (ret == sizeof(ctx) && + is_encryption_context_consistent_with_policy(&ctx, + &policy)) { + /* The file already uses the same encryption policy. */ + ret = 0; + } else if (ret >= 0 || ret == -ERANGE) { + /* The file already uses a different encryption policy. 
*/ + ret = -EEXIST; } inode_unlock(inode); @@ -134,49 +100,94 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0 && res != -ERANGE) + return res; if (res != sizeof(ctx)) - return -ENODATA; + return -EINVAL; if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. 
+ */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) + return 1; + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -185,17 +196,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -204,9 +230,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); * @parent: Parent inode from which the context is inherited. * @child: Child inode that inherits the context from @parent. * @fs_data: private data given by FS. 
- * @preload: preload child i_crypt_info + * @preload: preload child i_crypt_info if true * - * Return: Zero on success, non-zero otherwise + * Return: 0 on success, -errno on failure */ int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) @@ -215,9 +241,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; @@ -227,19 +250,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return -ENOKEY; ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - if (fscrypt_dummy_context_enabled(parent)) { - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE); - } + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ca949ea7c02f..a0dc559b1b47 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o extent_cache.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a45d1f4b7b0f..112f8e04c549 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -210,15 +210,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (acl && !ipage) { + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; @@ -236,7 +237,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); - return (int)PTR_ERR(value); + return PTR_ERR(value); } } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 45ef3b6bfb04..e86f67ac96c6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -31,7 +31,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); } /* @@ -163,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op_flags = sync ? 
(REQ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, + .in_list = false, }; struct blk_plug plug; @@ -208,12 +209,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - fio.old_blkaddr = fio.new_blkaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_bio(&fio); f2fs_put_page(page, 0); } out: - f2fs_submit_merged_bio(sbi, META, READ); blk_finish_plug(&plug); return blkno - start; } @@ -232,8 +231,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -246,16 +246,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, META); return 0; @@ -264,23 +265,33 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -292,7 +303,7 @@ skip_write: } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -343,7 +354,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -357,7 +368,7 @@ continue_unlock: } stop: if (nwritten) - f2fs_submit_merged_bio(sbi, type, WRITE); + f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); @@ -494,6 +505,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } #endif @@ -566,7 +578,7 @@ static int recover_orphan_inode(struct 
f2fs_sb_info *sbi, nid_t ino)
 	if (ni.blk_addr != NULL_ADDR) {
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
 		f2fs_msg(sbi->sb, KERN_WARNING,
-			"%s: orphan failed (ino=%x), run fsck to fix.",
+			"%s: orphan failed (ino=%x) by kernel, retry mount.",
 				__func__, ino);
 		return -EIO;
 	}
@@ -577,11 +589,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 {
 	block_t start_blk, orphan_blocks, i, j;
-	int err;
+	unsigned int s_flags = sbi->sb->s_flags;
+	int err = 0;
 
 	if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
 		return 0;
 
+	if (s_flags & MS_RDONLY) {
+		f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
+		sbi->sb->s_flags &= ~MS_RDONLY;
+	}
+
+#ifdef CONFIG_QUOTA
+	/* Needed for iput() to work correctly and not trash data */
+	sbi->sb->s_flags |= MS_ACTIVE;
+	/* Turn on quotas so that they are updated correctly */
+	f2fs_enable_quota_files(sbi);
+#endif
+
 	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
 	orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
 
@@ -597,14 +622,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 			err = recover_orphan_inode(sbi, ino);
 			if (err) {
 				f2fs_put_page(page, 1);
-				return err;
+				goto out;
 			}
 		}
 		f2fs_put_page(page, 1);
 	}
 	/* clear Orphan Flag */
 	clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
-	return 0;
+out:
+#ifdef CONFIG_QUOTA
+	/* Turn quotas off */
+	f2fs_quota_off_umount(sbi->sb);
+#endif
+	sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+
+	return err;
 }
 
 static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -676,14 +708,13 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
 	*cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
 
 	crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
-	if (crc_offset >= blk_size) {
+	if (crc_offset > (blk_size - sizeof(__le32))) {
 		f2fs_msg(sbi->sb, KERN_WARNING,
 			"invalid crc_offset: %zu", crc_offset);
 		return -EINVAL;
 	}
 
-	crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block
-						+ crc_offset)));
+	crc = cur_cp_crc(*cp_block);
 	if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) {
 		f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value");
 		return -EINVAL;
@@ -816,7 +847,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type)
 		return;
 
 	set_inode_flag(inode, flag);
-	list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
+	if (!f2fs_is_volatile_file(inode))
+		list_add_tail(&F2FS_I(inode)->dirty_list,
+						&sbi->inode_list[type]);
 	stat_inc_dirty_inode(sbi, type);
 }
 
@@ -874,6 +907,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
 	struct inode *inode;
 	struct f2fs_inode_info *fi;
 	bool is_dir = (type == DIR_INODE);
+	unsigned long ino = 0;
 
 	trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
 				get_pages(sbi, is_dir ?
@@ -896,14 +930,30 @@ retry:
 	inode = igrab(&fi->vfs_inode);
 	spin_unlock(&sbi->inode_lock[type]);
 	if (inode) {
+		unsigned long cur_ino = inode->i_ino;
+
+		if (is_dir)
+			F2FS_I(inode)->cp_task = current;
+
 		filemap_fdatawrite(inode->i_mapping);
+
+		if (is_dir)
+			F2FS_I(inode)->cp_task = NULL;
+
 		iput(inode);
+		/* We need to give the CPU to other writers. */
+		if (ino == cur_ino) {
+			congestion_wait(BLK_RW_ASYNC, HZ/50);
+			cond_resched();
+		} else {
+			ino = cur_ino;
+		}
 	} else {
 		/*
 		 * We should submit bio, since it exists several
 		 * writebacking dentry pages in the freeing inode.
*/ - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; @@ -941,6 +991,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -964,33 +1027,47 @@ retry_flush_dents: err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; - goto retry_flush_dents; - } - - if (get_pages(sbi, F2FS_DIRTY_IMETA)) { - f2fs_unlock_all(sbi); - err = f2fs_sync_inode_meta(sbi); - if (err) - goto out; + cond_resched(); goto retry_flush_dents; } /* * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. + * until finishing nat/sit flush. inode->i_blocks can be updated. */ + down_write(&sbi->node_change); + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + goto out; + cond_resched(); + goto retry_flush_dents; + } + retry_flush_nodes: down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. 
+ */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -999,8 +1076,6 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - - build_free_nids(sbi, false); f2fs_unlock_all(sbi); } @@ -1023,15 +1098,24 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); - if (cpc->reason == CP_UMOUNT) + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1047,15 +1131,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1067,19 +1150,16 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1098,18 +1178,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -1138,6 +1214,27 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + 
block_t blk;
+
+		cp_ver |= ((__u64)crc32 << 32);
+		*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
+
+		blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
+		for (i = 0; i < nm_i->nat_bits_blocks; i++)
+			update_meta_page(sbi, nm_i->nat_bits +
+					(i << F2FS_BLKSIZE_BITS), blk + i);
+
+		/* Flush all the NAT BITS pages */
+		while (get_pages(sbi, F2FS_DIRTY_META)) {
+			sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
+			if (unlikely(f2fs_cp_error(sbi)))
+				return -EIO;
+		}
+	}
+
 	/* need to wait for end_io results */
 	wait_on_all_pages_writeback(sbi);
 	if (unlikely(f2fs_cp_error(sbi)))
@@ -1187,7 +1284,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
 
 	/* Here, we only have one bio having CP pack */
-	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+	sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
 
 	/* wait for previous submitted meta pages writeback */
 	wait_on_all_pages_writeback(sbi);
@@ -1226,8 +1323,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	mutex_lock(&sbi->cp_mutex);
 
 	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
-		(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
-		(cpc->reason == CP_DISCARD && !sbi->discard_blks)))
+		((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
+		((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi))) {
 		err = -EIO;
@@ -1246,10 +1343,10 @@
 
 	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
 
-	f2fs_flush_merged_bios(sbi);
+	f2fs_flush_merged_writes(sbi);
 
 	/* this is the case of multiple fstrims without any changes */
-	if (cpc->reason == CP_DISCARD) {
+	if (cpc->reason & CP_DISCARD) {
 		if (!exist_trim_candidates(sbi, cpc)) {
 			unblock_operations(sbi);
 			goto out;
@@ -1274,7 +1371,7 @@
 	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
 
 	/* write cached NAT/SIT entries to NAT/SIT area */
-	flush_nat_entries(sbi);
+	flush_nat_entries(sbi, cpc);
 	flush_sit_entries(sbi, cpc);
 
 	/* unlock all the fs_lock[] in do_checkpoint() */
@@ -1287,7 +1384,7 @@
 	unblock_operations(sbi);
 	stat_inc_cp_count(sbi->stat_info);
 
-	if (cpc->reason == CP_RECOVERY)
+	if (cpc->reason & CP_RECOVERY)
 		f2fs_msg(sbi->sb, KERN_NOTICE,
 			"checkpoint: version = %llx", ckpt_ver);
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
deleted file mode 100644
index 18595d7a0efc..000000000000
--- a/fs/f2fs/crypto_key.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * linux/fs/f2fs/crypto_key.c
- *
- * Copied from linux/fs/ext4/crypto_key.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption key functions for f2fs
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
- */
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <uapi/linux/keyctl.h>
-#include <crypto/hash.h>
-#include <linux/f2fs_fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
-	struct f2fs_completion_result *ecr = req->data;
-
-	if (rc == -EINPROGRESS)
-		return;
-
-	ecr->res = rc;
-	complete(&ecr->completion);
-}
-
-/**
- * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivation.
- * @source_key:   Source key to which to apply derivation.
- * @derived_key:  Derived key.
- *
- * Return: Zero on success; non-zero otherwise.
- */ -static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], - char source_key[F2FS_AES_256_XTS_KEY_SIZE], - char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) -{ - int res = 0; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - ablkcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, - F2FS_AES_128_ECB_KEY_SIZE); - if (res < 0) - goto out; - - sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, - F2FS_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } -out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); - return res; -} - -static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) -{ - if (!ci) - return; - - crypto_free_ablkcipher(ci->ci_ctfm); - kmem_cache_free(f2fs_crypt_info_cachep, ci); -} - -void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *prev; - - if (ci == NULL) - ci = ACCESS_ONCE(fi->i_crypt_info); - if (ci == NULL) - return; - prev = cmpxchg(&fi->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - f2fs_free_crypt_info(ci); -} - -int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *crypt_info; - char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct f2fs_encryption_key *master_key; - struct f2fs_encryption_context ctx; - const struct user_key_payload *ukp; - struct crypto_ablkcipher *ctfm; - const char *cipher_str; - char raw_key[F2FS_MAX_KEY_SIZE]; - char mode; - int res; - - if (fi->i_crypt_info) - return 0; - - res = f2fs_crypto_initialize(); - if (res) - return res; - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res < 0) - return res; - else if (res != sizeof(ctx)) - return -EINVAL; - res = 0; - - crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - crypt_info->ci_ctfm = NULL; - memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "f2fs: unsupported key mode %d 
(ino %u)\n", - mode, (unsigned) inode->i_ino); - res = -ENOKEY; - goto out; - } - - memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE, - "%*phN", F2FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - BUG_ON(keyring_key->type != &key_type_logon); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { - res = -EINVAL; - goto out; - } - master_key = (struct f2fs_encryption_key *)ukp->data; - BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != - F2FS_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); - res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); - if (res) - goto out; - - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); - goto out; - } - crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - f2fs_encryption_key_size(mode)); - if (res) - goto out; - - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) == NULL) - crypt_info = NULL; -out: - if (res == -ENOKEY && !S_ISREG(inode->i_mode)) - res = 0; - key_put(keyring_key); - f2fs_free_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); - return res; -} - -int f2fs_has_encryption_key(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - - return (fi->i_crypt_info != NULL); -} diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c deleted file mode 100644 index 884f3f0fe29d..000000000000 --- a/fs/f2fs/crypto_policy.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * copied from linux/fs/ext4/crypto_policy.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility. - * - * This contains encryption policy functions for f2fs with some modifications - * to support f2fs-specific xattr APIs. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. 
- */
-#include <linux/random.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static int f2fs_inode_has_encryption_context(struct inode *inode)
-{
-	int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
-	return (res > 0);
-}
-
-/*
- * check whether the policy is consistent with the encryption context
- * for the inode
- */
-static int f2fs_is_encryption_context_consistent_with_policy(
-	struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-	int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
-			sizeof(ctx), NULL);
-
-	if (res != sizeof(ctx))
-		return 0;
-
-	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
-			F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-			(ctx.flags == policy->flags) &&
-			(ctx.contents_encryption_mode ==
-			 policy->contents_encryption_mode) &&
-			(ctx.filenames_encryption_mode ==
-			 policy->filenames_encryption_mode));
-}
-
-static int f2fs_create_encryption_context_from_policy(
-	struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-
-	ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
-			F2FS_KEY_DESCRIPTOR_SIZE);
-
-	if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
-		printk(KERN_WARNING
-			"%s: Invalid contents encryption mode %d\n", __func__,
-			policy->contents_encryption_mode);
-		return -EINVAL;
-	}
-
-	if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
-		printk(KERN_WARNING
-			"%s: Invalid filenames encryption mode %d\n", __func__,
-			policy->filenames_encryption_mode);
-		return -EINVAL;
-	}
-
-	if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
-		return -EINVAL;
-
-	ctx.contents_encryption_mode = policy->contents_encryption_mode;
-	ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
-	ctx.flags = policy->flags;
-	BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
-	get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
-
-	return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
-			sizeof(ctx), NULL, XATTR_CREATE);
-}
-
-int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
-			struct inode *inode)
-{
-	if (!inode_owner_or_capable(inode))
-		return -EACCES;
-
-	if (policy->version != 0)
-		return -EINVAL;
-
-	if (!S_ISDIR(inode->i_mode))
-		return -EINVAL;
-
-	if (!f2fs_inode_has_encryption_context(inode)) {
-		if (!f2fs_empty_dir(inode))
-			return -ENOTEMPTY;
-		return f2fs_create_encryption_context_from_policy(inode,
-								policy);
-	}
-
-	if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
-		return 0;
-
-	printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
-		__func__);
-	return -EINVAL;
-}
-
-int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-	int res;
-
-	if (!f2fs_encrypted_inode(inode))
-		return -ENODATA;
-
-	res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
-				&ctx, sizeof(ctx), NULL);
-	if (res != sizeof(ctx))
-		return -ENODATA;
-	if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
-		return -EINVAL;
-
-	policy->version = 0;
-	policy->contents_encryption_mode = ctx.contents_encryption_mode;
-	policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
-	policy->flags = ctx.flags;
-
memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - return 0; -} - -int f2fs_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child) -{ - const struct f2fs_crypt_info *parent_ci, *child_ci; - struct f2fs_encryption_context parent_ctx, child_ctx; - int res; - - /* No restrictions on file types which are never encrypted */ - if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && - !S_ISLNK(child->i_mode)) - return 1; - - /* No restrictions if the parent directory is unencrypted */ - if (!f2fs_encrypted_inode(parent)) - return 1; - - /* Encrypted directories must not contain unencrypted files */ - if (!f2fs_encrypted_inode(child)) - return 0; - - /* - * Both parent and child are encrypted, so verify they use the same - * encryption policy. Compare the fscrypt_info structs if the keys are - * available, otherwise retrieve and compare the fscrypt_contexts. - * - * Note that the fscrypt_context retrieval will be required frequently - * when accessing an encrypted directory tree without the key. - * Performance-wise this is not a big deal because we already don't - * really optimize for file access without the key (to the extent that - * such access is even possible), given that any attempted access - * already causes a fscrypt_context retrieval and keyring search. - * - * In any case, if an unexpected error occurs, fall back to "forbidden". - */ - - res = f2fs_get_encryption_info(parent); - if (res) - return 0; - res = f2fs_get_encryption_info(child); - if (res) - return 0; - parent_ci = F2FS_I(parent)->i_crypt_info; - child_ci = F2FS_I(child)->i_crypt_info; - if (parent_ci && child_ci) { - return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == - child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags); - } - - res = f2fs_getxattr(parent, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &parent_ctx, sizeof(parent_ctx), NULL); - if (res != sizeof(parent_ctx)) - return 0; - - res = f2fs_getxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &child_ctx, sizeof(child_ctx), NULL); - if (res != sizeof(child_ctx)) - return 0; - - return memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode) && - (parent_ctx.flags == child_ctx.flags); -} - -/** - * f2fs_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. 
- * - * Return: Zero on success, non-zero otherwise - */ -int f2fs_inherit_context(struct inode *parent, struct inode *child, - struct page *ipage) -{ - struct f2fs_encryption_context ctx; - struct f2fs_crypt_info *ci; - int res; - - res = f2fs_get_encryption_info(parent); - if (res < 0) - return res; - - ci = F2FS_I(parent)->i_crypt_info; - BUG_ON(ci == NULL); - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE); - - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), ipage, XATTR_CREATE); -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4ac72a3f920a..c8583d7a1845 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -56,8 +56,10 @@ static void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; + } #endif if (f2fs_bio_encrypted(bio)) { @@ -223,7 +225,7 @@ submit_io: trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); - submit_bio(0, bio); + submit_bio(bio_op(bio), bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -244,8 +246,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, - struct page *page, nid_t ino) +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) { struct bio_vec *bvec; struct page *target; @@ -254,7 +256,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!io->bio) return false; - if (!inode && !page && !ino) + if (!inode && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -264,10 +266,11 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, else target = fscrypt_control_page(bvec->bv_page); + if (idx != target->index) + continue; + if (inode && inode == target->mapping->host) return true; - if (page && page == target) - return true; if (ino && ino == ino_of_node(target)) return true; } @@ -276,70 +279,88 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, nid_t ino, - enum page_type type) + nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - bool ret; + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); - up_read(&io->io_rwsem); + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); + + /* TODO: use HOT temp only for meta pages now. 
*/ + if (ret || btype == META) + break; + } return ret; } -static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io; - - io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, page, ino)) - goto out; - /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_FUA; + io->fio.op_flags |= WRITE_FLUSH | REQ_FUA; } __submit_merged_bio(io); -out: up_write(&io->io_rwsem); } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw) +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) { - __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. */ + if (type >= META) + break; + } } -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - if (has_merged_page(sbi, inode, page, ino, type)) - __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); } -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); +} + +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); } /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { @@ -360,19 +381,35 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } -int f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->op); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; int err = 0; - io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; + f2fs_bug_on(sbi, is_read_io(fio->op)); + + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out_fail; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); @@ -380,10 +417,10 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (!is_read) - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + /* set submitted = 1 as a return value */ + fio->submitted = 1; - down_write(&io->io_rwsem); + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || @@ -398,32 +435,86 @@ alloc_new: goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, - BIO_MAX_PAGES, is_read); + BIO_MAX_PAGES, false); io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); + + trace_f2fs_submit_page_write(fio->page, fio); + + if (fio->in_list) + goto next; out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(fio->page, fio); return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -451,14 +542,15 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = 
inc_valid_block_count(sbi, dn->inode, &count)))) + return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); @@ -466,8 +558,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -509,7 +601,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei; + struct extent_info ei = {0,0,0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -526,18 +618,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -578,9 +660,7 @@ got_it: return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -709,23 +789,25 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; pgoff_t fofs; blkcnt_t count = 1; + int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA); + &sum, CURSEG_WARM_DATA, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -739,7 +821,7 @@ alloc: static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return (f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -750,6 +832,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -768,7 +853,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -778,6 +863,21 @@ int f2fs_preallocate_blocks(struct kiocb 
*iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supports readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. @@ -798,7 +898,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei; + struct extent_info ei = {0,0,0}; block_t blkaddr; if (!maxblocks) @@ -820,7 +920,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -842,7 +942,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { @@ -862,7 +962,7 @@ next_block: } if (err) goto sync_out; - map->m_flags = F2FS_MAP_NEW; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { if (flag == F2FS_GET_BLOCK_BMAP) { @@ -930,7 +1030,7 @@ skip: f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -939,7 +1039,7 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -962,7 +1062,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = map.m_len << inode->i_blkbits; + bh->b_size = (u64)map.m_len << inode->i_blkbits; } return err; } @@ -979,7 +1079,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1085,35 +1185,6 @@ out: return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
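For context (illustrative sketch, not part of the diff): __do_map_lock() pairs a lock and unlock chosen by flag. F2FS_GET_BLOCK_PRE_AIO only needs to exclude concurrent node changes, so it takes the lighter sbi->node_change read lock, while every other flag still takes the checkpoint lock via f2fs_lock_op()/f2fs_unlock_op(). A minimal sketch of the intended pairing, using only names from the hunks above:

	if (create)
		__do_map_lock(sbi, flag, true);		/* PRE_AIO: down_read(&sbi->node_change) */
	/* ... look up or allocate block addresses for the mapping ... */
	if (create)
		__do_map_lock(sbi, flag, false);	/* unlock with the same flag */
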
@@ -1142,9 +1213,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1177,7 +1249,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: @@ -1208,12 +1280,11 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1273,17 +1344,84 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_file(inode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); + } + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (valid_ipu_blkaddr(fio)) { + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; + } + } + + /* Avoid deadlock between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1292,57 +1430,57 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = 
fscrypt_encrypt_page(inode, fio->page, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } - - set_page_writeback(page); - +got_it: /* * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { - rewrite_data_page(fio); + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + f2fs_put_dnode(&dn); + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1358,12 +1496,19 @@ static int f2fs_write_data_page(struct page *page, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (page->index < end_index) goto write; @@ -1377,8 +1522,6 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_SIZE); write: - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; /* we should not write 0'th page having journal header */ @@ -1395,6 +1538,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); goto done; } @@ -1403,16 +1547,26 @@ write: need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; - f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); - if (err == -EAGAIN) + if (!err) + goto out; + } + + if (err == -EAGAIN) { err = do_write_data_page(&fio); + if (err == 
-EAGAIN) { + fio.need_lock = LOCK_REQ; + err = do_write_data_page(&fio); + } + } if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; @@ -1423,15 +1577,23 @@ out: ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); + submitted = NULL; } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, DATA, WRITE); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, DATA); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; return 0; @@ -1443,13 +1605,20 @@ redirty_out: return err; } +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, NULL, wbc, FS_DATA_IO); +} + /* * This function was copied from write_cache_pages in mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1459,13 +1628,19 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; - int nwritten = 0; pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1499,6 +1674,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (page->index > end) { done = 1; @@ -1506,7 +1682,7 @@ retry: } done_index = page->index; - +retry_write: lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1532,7 +1708,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = mapping->a_ops->writepage(page, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1542,16 +1718,27 @@ continue_unlock: unlock_page(page); ret = 0; continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; } done_index = page->index + 1; done = 1; break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1569,15 +1756,16 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (nwritten) - f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + 
f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1592,6 +1780,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1601,15 +1793,20 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; - /* during POR, we don't need to trigger writepage at all. */ - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid splitting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee that mapping->host * can detect pending bios. @@ -1624,14 +1821,26 @@ skip_write: return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -1644,19 +1853,20 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err = 0; /* * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. 
*/ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1670,7 +1880,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) @@ -1692,7 +1902,8 @@ restart: err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1706,7 +1917,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } @@ -1745,7 +1956,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. We will wait for it below with our IO control. + */ + page = grab_cache_page(mapping, index); if (!page) { err = -ENOMEM; goto fail; } @@ -1772,8 +1987,8 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -1787,21 +2002,9 @@ repeat: zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - bio->bi_rw = READ_SYNC; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; - goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1914,10 +2117,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { + f2fs_write_failed(mapping, offset + count); + } } if (trace_android_fs_dataread_start_enabled() && @@ -1955,7 +2161,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); @@ -2064,8 +2270,12 @@ int f2fs_migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* migrating an atomic written page is safe with the inmem_lock held */ - if (atomic_written && !mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } /* * A reference is expected if PagePrivate set when move mapping,
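For context (hedged sketch, not part of the diff): the write-path hunks earlier in this file keep merged write bios per temperature as well as per page type. The addressing below assumes, based on the loops shown above, that sbi->write_io[type] points to an array of NR_TEMP_TYPE f2fs_bio_info slots and that temp_type orders HOT before WARM and COLD:

	enum page_type btype = PAGE_TYPE_OF_BIO(type);
	struct f2fs_bio_info *io = sbi->write_io[btype] + temp;	/* one slot per (type, temp) */

	/* DATA/NODE flush every temperature; META only uses the HOT slot */
	for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
		__f2fs_submit_merged_write(sbi, type, temp);
		if (type >= META)
			break;
	}

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 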
cd338ca24941..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,9 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -64,6 +81,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); @@ -78,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -91,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -116,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -148,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; @@ -185,6 +209,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build nm */ si->base_mem += sizeof(struct 
f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); get_cache: si->cache_mem = 0; @@ -196,6 +224,11 @@ get_cache: /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) { + si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -256,8 +289,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Orphan Inode: %u\n", - si->orphans); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -316,10 +349,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", - si->nr_wb_cp_data, si->nr_wb_data); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. 
%4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -332,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -419,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4436079dbf0c..4f2a8fedb313 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; if (max_slots && max_len > *max_slots) @@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -250,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } @@ -268,7 +256,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, &fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } @@ -330,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - 
page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -377,7 +350,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -431,15 +404,19 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { - file_lost_pino(inode); + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. @@ -535,7 +512,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -545,8 +522,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; + } #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; @@ -590,11 +569,9 @@ add_dentry: err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -643,14 +620,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stackable filesystem shows a race condition between lookup + * and create. If we have the same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify the on-disk dentry one more time, which better guarantees + * filesystem consistency. 
+ */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } @@ -708,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -721,7 +720,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, @@ -738,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); @@ -882,7 +886,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 6ed6424807b6..ff2352a0ed15 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,179 @@ #include "node.h" #include +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * 
@next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simplify the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -77,7 +250,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) struct extent_tree *et; nid_t ino = inode->i_ino; - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); @@ -94,7 +267,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_dec(&sbi->total_zombie_tree); list_del_init(&et->list); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); /* never died until evict_inode */ F2FS_I(inode)->extent_tree = et; @@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct 
extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -177,7 +320,7 @@ static void __drop_largest_extent(struct inode *inode, } /* return true, if inode page is changed */ -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; @@ -215,6 +358,16 @@ out: return false; } +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { @@ -237,17 +390,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +416,6 @@ out: return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. 
- */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +470,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -413,7 +486,7 @@ do_insert: return en; } -static unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static void f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -426,7 +499,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int pos = (unsigned int)fofs; if (!et) - return false; + return; trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); @@ -434,7 +507,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); - return false; + return; } prev = et->largest; @@ -447,8 +520,11 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. 
lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, - &insert_p, &insert_parent); + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); if (!en) en = next_en; @@ -531,8 +607,6 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __free_extent_tree(sbi, et); write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -548,7 +622,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!atomic_read(&sbi->total_zombie_tree)) goto free_node; - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ @@ -570,11 +644,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) goto unlock_out; cond_resched(); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); @@ -604,7 +678,7 @@ free_node: spin_unlock(&sbi->extent_lock); unlock_out: - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); @@ -651,10 +725,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); list_add_tail(&et->list, &sbi->zombie_list); atomic_inc(&sbi->total_zombie_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); return; } @@ -662,12 +736,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) node_cnt = f2fs_destroy_extent_node(inode); /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); atomic_dec(&sbi->total_ext_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -714,7 +788,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, void init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); + mutex_init(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); atomic_set(&sbi->total_ext_tree, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 167c5f841b5f..ff694127243a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,7 +22,12 @@ #include #include #include -#include +#include +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#include +#else +#include +#endif #include #include @@ -47,6 +52,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, @@ -59,7 +65,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -84,10 +90,14 @@ extern char 
*fault_name[FAULT_MAX]; #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -103,15 +113,19 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_SET_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) #define F2FS_CLEAR_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* bio stuffs */ #define REQ_OP_READ READ @@ -187,19 +201,22 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2 +#define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -241,19 +258,73 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ +}; + +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? 
(MAX_PLIST_NUM - 1) : (blk_num - 1)) + +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + +enum { + D_PREP, + D_SUBMIT, + D_DONE, +}; + +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; struct list_head list; /* command list */ struct completion wait; /* completion */ - block_t lstart; /* logical start address */ - block_t len; /* length */ - struct bio *bio; /* bio */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + int error; /* bio error */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ + struct list_head wait_list; /* store on-flushing entries */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ + struct mutex cmd_lock; + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + atomic_t issued_discard; /* # of issued discards */ + atomic_t issing_discard; /* # of discards being issued */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -264,13 +335,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -278,6 +349,7 @@ struct fsync_inode_entry { static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } @@ -285,6 +357,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } @@ -310,11 +383,17 @@ static inline bool 
__has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -339,6 +418,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + struct f2fs_defragment { u64 start; u64 len; @@ -351,36 +436,68 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, void *t) +{ + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + + d->inode = inode; + d->max = entry_cnt; + 
d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -412,16 +529,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; @@ -455,12 +586,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -502,16 +634,29 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ - unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -538,11 +683,22 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } -static inline bool __is_extent_same(struct extent_info *ei1, - struct extent_info *ei2) +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) { - return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && - ei1->len == ei2->len); + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info 
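/*
 * Reviewer's aside, not part of the diff: struct discard_cmd and struct
 * extent_node above both wrap an anonymous struct and a named mirror
 * (discard_info / extent_info) in a union, so one allocation can be read
 * field-by-field or handed around as a unit without copying. A standalone
 * C11 sketch of the same aliasing trick (range_info/range_node are made-up
 * names):
 *
 *	#include <stdio.h>
 *
 *	struct range_info {
 *		unsigned int ofs;
 *		unsigned int len;
 *	};
 *
 *	struct range_node {
 *		union {
 *			struct {		// direct, per-field view
 *				unsigned int ofs;
 *				unsigned int len;
 *			};
 *			struct range_info ri;	// aggregate view
 *		};
 *	};
 *
 *	int main(void)
 *	{
 *		struct range_node n;
 *
 *		n.ri = (struct range_info){ .ofs = 8, .len = 4 };
 *		printf("%u %u\n", n.ofs, n.len);	// prints: 8 4
 *		return 0;
 *	}
 */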
*back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); } static inline bool __is_extent_mergeable(struct extent_info *back, @@ -564,7 +720,7 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { @@ -596,6 +752,7 @@ struct f2fs_nm_info { struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -603,9 +760,17 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ #ifdef CONFIG_F2FS_CHECK_FS char *nat_bitmap_mir; /* NAT bitmap mirror */ #endif @@ -676,7 +841,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -699,12 +865,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for small discard management */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - struct list_head discard_cmd_list; /* discard cmd list */ - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ - /* for batched trimming */ unsigned int trim_sections; /* # of sections to trim */ @@ -713,9 +873,13 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; }; /* @@ -760,29 +924,68 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. 
*/ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, }; +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from foreground gc */ + FS_GC_NODE_IO, /* node IOs from foreground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs */ + bool submitted; /* indicate IO submission */ + int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info.
*/ struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ }; #define FDEV(i) (sbi->devs[i]) @@ -830,10 +1033,6 @@ enum { MAX_TIME, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 -#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -841,11 +1040,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ -#ifdef CONFIG_F2FS_FS_ENCRYPTION - u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif - #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ @@ -859,9 +1053,9 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info read_io; /* for read bios */ - struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ @@ -873,6 +1067,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -888,7 +1083,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ @@ -918,6 +1113,8 @@ struct f2fs_sb_info { block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidates */ block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ @@ -925,6 +1122,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; @@ -935,6 +1135,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -955,13 +1158,19 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t
max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs support */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -980,13 +1189,26 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, fault_name[type], \ + __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { struct f2fs_fault_info *ffi = &sbi->fault_info; @@ -1000,10 +1222,6 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); return true; } return false; @@ -1014,8 +1232,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info.
*/ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -1068,6 +1286,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1168,6 +1407,12 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -1191,9 +1436,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) @@ -1207,9 +1454,34 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? 
(cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1217,6 +1489,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) down_read(&sbi->cp_rwsem); } +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { up_read(&sbi->cp_rwsem); @@ -1245,7 +1522,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) @@ -1266,17 +1543,14 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 - /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; - else - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1284,15 +1558,24 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - blkcnt_t diff; + blkcnt_t diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_BLOCK)) - return false; + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + release = *count; + goto enospc; + } #endif /* * let's increase this in prior to actual block count change in order @@ -1302,32 +1585,42 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { - diff = sbi->total_valid_block_count - sbi->user_block_count; + avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; - sbi->total_valid_block_count = sbi->user_block_count; + release = diff; + sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); - return false; + goto enospc; } } spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, *count, true); - return true; + if (release) + dquot_release_reservation_block(inode, release); + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + dquot_release_reservation_block(inode, release); + return -ENOSPC; } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); 
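/*
 * Reviewer's aside, not part of the diff: with the quota hooks added above,
 * inc_valid_block_count() follows the usual reserve-then-claim discipline:
 * dquot_reserve_block() reserves the whole request up front, the grant is
 * trimmed to what fits under (user_block_count - reserved_blocks), the
 * reservation for the ungranted remainder is released, and only the blocks
 * actually granted are claimed. A standalone sketch of that flow, with
 * plain counters standing in for the dquot_* calls and the sb totals:
 *
 *	#include <stdio.h>
 *
 *	static unsigned long long reserved, claimed;
 *
 *	static int alloc_blocks(unsigned long long *count,
 *				unsigned long long avail)
 *	{
 *		unsigned long long release = 0;
 *
 *		reserved += *count;		// dquot_reserve_block()
 *		if (*count > avail) {		// partial grant under pressure
 *			release = *count - avail;
 *			*count = avail;
 *		}
 *		reserved -= release;	// dquot_release_reservation_block()
 *		if (!*count)
 *			return -1;		// nothing granted: -ENOSPC
 *		reserved -= *count;		// dquot_claim_block() turns the
 *		claimed += *count;		// reservation into real usage
 *		return 0;
 *	}
 *
 *	int main(void)
 *	{
 *		unsigned long long count = 8;
 *
 *		alloc_blocks(&count, 5);
 *		printf("%llu %llu %llu\n", count, reserved, claimed);
 *		return 0;			// prints: 5 0 5
 *	}
 */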
f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < count); + f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, count, false); + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1456,51 +1749,70 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count > sbi->user_block_count)) { + if (unlikely(valid_block_count + sbi->reserved_blocks > + sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } - if (inode) - f2fs_i_blocks_write(inode, 1, true); - sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + percpu_counter_inc(&sbi->alloc_valid_block_count); - return true; + return 0; + +enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, !sbi->total_valid_block_count); f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !inode->i_blocks); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) @@ -1528,11 +1840,14 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, { #ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page = find_lock_page(mapping, index); + if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); return NULL; + } #endif if (!for_write) return grab_cache_page(mapping, index); @@ -1611,22 +1926,42 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, static inline bool IS_INODE(struct page *page) { struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? 
node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); + raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1689,6 +2024,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1715,6 +2064,10 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1768,13 +2121,21 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - blkcnt_t diff, bool add) + block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); - inode->i_blocks = add ? 
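/*
 * Reviewer's aside, not part of the diff: datablock_addr() above now takes
 * the inode so it can skip the extra-attribute words that occupy the front
 * of i_addr[] when F2FS_EXTRA_ATTR is set: the base index is simply
 * i_extra_isize divided by the __le32 slot width. A toy, self-contained
 * sketch (the array contents are made up):
 *
 *	#include <stdio.h>
 *
 *	static unsigned int i_addr[8] = { 111, 222, 333, 444 };
 *
 *	static unsigned int datablock_addr(int extra_isize, unsigned int ofs)
 *	{
 *		int base = extra_isize / 4;	// __le32 slots to skip
 *
 *		return i_addr[base + ofs];
 *	}
 *
 *	int main(void)
 *	{
 *		// Without extra attributes block 0 lives at i_addr[0];
 *		// with 8 bytes of extra attributes it moves to i_addr[2].
 *		printf("%u %u\n", datablock_addr(0, 0), datablock_addr(8, 0));
 *		return 0;			// prints: 111 333
 *	}
 */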
inode->i_blocks + diff : - inode->i_blocks - diff; + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); @@ -1826,6 +2187,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -1842,6 +2205,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -1852,13 +2222,14 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1876,12 +2247,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -1917,10 +2282,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); - return (void *)&(ri->i_addr[1]); + int extra_size = get_extra_isize(inode); + + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2003,13 +2370,15 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); return NULL; + } #endif return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +static inline void *kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -2019,7 +2388,7 @@ static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) return ret; } -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +static inline void *kvzalloc(size_t size, gfp_t flags) { void *ret; @@ -2029,42 +2398,79 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) return ret; } +static inline int get_extra_isize(struct inode *inode) +{ + return 
F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} /* * file.c */ -int f2fs_sync_file(struct file *, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void truncate_data_blocks(struct dnode_of_data *dn); +int truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -struct inode *f2fs_iget_retry(struct super_block *, unsigned long); -int try_to_free_nats(struct f2fs_sb_info *, int); -int update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); -void handle_failed_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block 
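/*
 * Reviewer's aside, not part of the diff: F2FS_FITS_IN_INODE() above
 * decides, with nothing but offsetof()/sizeof() arithmetic, whether an
 * on-disk field ends inside the space this inode really carries (the fixed
 * pre-i_addr area plus its recorded extra size). The same test against a
 * toy layout; struct toy_inode and its fields are invented for
 * illustration:
 *
 *	#include <stddef.h>
 *	#include <stdio.h>
 *
 *	struct toy_inode {
 *		int i_mode;		// fixed, always-present attributes...
 *		int i_extra_isize;	// ...end here
 *		int i_projid;		// lives in the optional extra area
 *	};
 *
 *	#define OLD_SIZE	offsetof(struct toy_inode, i_projid)
 *	#define FITS(field, extra)				\
 *		(offsetof(struct toy_inode, field) +		\
 *		 sizeof(((struct toy_inode *)0)->field) <=	\
 *		 OLD_SIZE + (extra))
 *
 *	int main(void)
 *	{
 *		// with 4 extra bytes i_projid fits, with none it does not
 *		printf("%d %d\n", FITS(i_projid, 4), FITS(i_projid, 0));
 *		return 0;			// prints: 1 0
 *	}
 */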
*sb, unsigned long ino); +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +int update_inode(struct inode *inode, struct page *node_page); +int update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void handle_failed_inode(struct inode *inode); /* * namei.c @@ -2074,40 +2480,45 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *, umode_t); -unsigned char get_de_type(struct f2fs_dir_entry *); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, - f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct fscrypt_str *); -void do_make_empty_dir(struct inode *, struct inode *, - struct f2fs_dentry_ptr *); -struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, const struct qstr *, struct page *); -void update_parent_metadata(struct inode *, struct inode *, unsigned int); -int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, - struct page **); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int update_dent_inode(struct inode *, struct inode *, const struct qstr *); -void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t , unsigned int); -int f2fs_add_regular_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, - nid_t, umode_t); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, - umode_t); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, - struct inode *); -int f2fs_do_tmpfile(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +void set_de_type(struct f2fs_dir_entry *de, umode_t mode); +unsigned char get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct 
inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { @@ -2118,18 +2529,21 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *, bool); -void f2fs_inode_synced(struct inode *); -int f2fs_commit_super(struct f2fs_sb_info *, bool); -int f2fs_sync_fs(struct super_block *, int); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c @@ -2137,164 +2551,190 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *); struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *, int); -int need_dentry_mark(struct f2fs_sb_info *, nid_t); -bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool need_inode_block_update(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct inode *, pgoff_t); -int truncate_xattr_node(struct inode *, struct page *); -int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_node_page_ra(struct page *, int); -void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, struct inode *, - struct writeback_control *, bool); -int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *, bool); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -int try_to_free_nids(struct f2fs_sb_info *, 
int); -void recover_inline_xattr(struct inode *, struct page *); -void recover_xattr_data(struct inode *, struct page *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); +bool available_free_memory(struct f2fs_sb_info *sbi, int type); +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int truncate_inode_blocks(struct inode *inode, pgoff_t from); +int truncate_xattr_node(struct inode *inode, struct page *page); +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int remove_inode_page(struct inode *inode); +struct page *new_inode_page(struct inode *inode); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *get_node_page_ra(struct page *parent, int start); +void move_node_page(struct page *node_page, int gc_type); +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void recover_inline_xattr(struct inode *inode, struct page *page); +int recover_xattr_data(struct inode *inode, struct page *page, + block_t blkaddr); +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_node_manager(struct f2fs_sb_info *sbi); +void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* * segment.c */ -void register_inmem_page(struct inode *, struct page *); -void drop_inmem_pages(struct inode *); -int commit_inmem_pages(struct inode *); -void f2fs_balance_fs(struct f2fs_sb_info *, bool); -void f2fs_balance_fs_bg(struct f2fs_sb_info *); -int f2fs_issue_flush(struct f2fs_sb_info *); -int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); -void invalidate_blocks(struct f2fs_sb_info *, block_t); -bool is_checkpointed_data(struct f2fs_sb_info *, block_t); -void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); -void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); -void release_discard_addrs(struct f2fs_sb_info *); -int npages_for_summary_flush(struct f2fs_sb_info *, bool); -void 
allocate_new_segments(struct f2fs_sb_info *); -int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); -bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -void update_meta_page(struct f2fs_sb_info *, void *, block_t); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(unsigned int, struct f2fs_io_info *); -void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); -void rewrite_data_page(struct f2fs_io_info *); -void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, - block_t, block_t, bool, bool); -void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool, bool); -void allocate_data_block(struct f2fs_sb_info *, struct page *, - block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +bool need_SSR(struct f2fs_sb_info *sbi); +void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); +int commit_inmem_pages(struct inode *inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int create_flush_cmd_control(struct f2fs_sb_info *sbi); +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void stop_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void release_discard_addrs(struct f2fs_sb_info *sbi); +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); +void write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void allocate_data_block(struct f2fs_sb_info *sbi, struct 
page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_segment_manager(struct f2fs_sb_info *sbi); +void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); -bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); -void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *, bool); -bool exist_written_data(struct f2fs_sb_info *, nid_t, int); -int f2fs_sync_inode_meta(struct f2fs_sb_info *); -int acquire_orphan_inode(struct f2fs_sb_info *); -void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct inode *); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void update_dirty_page(struct inode *, struct page *); -void remove_dirty_inode(struct inode *); -int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); -int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); -void init_ino_entry_info(struct f2fs_sb_info *); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type); +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int acquire_orphan_inode(struct f2fs_sb_info *sbi); +void release_orphan_inode(struct f2fs_sb_info *sbi); +void add_orphan_inode(struct inode *inode); +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int recover_orphan_inodes(struct f2fs_sb_info *sbi); +int get_valid_checkpoint(struct f2fs_sb_info *sbi); +void update_dirty_page(struct inode *inode, struct 
page *page); +void remove_dirty_inode(struct inode *inode); +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void init_ino_entry_info(struct f2fs_sb_info *sbi); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, - struct page *, nid_t, enum page_type, int); -void f2fs_flush_merged_bios(struct f2fs_sb_info *); -int f2fs_submit_page_bio(struct f2fs_io_info *); -int f2fs_submit_page_mbio(struct f2fs_io_info *); -struct block_device *f2fs_target_device(struct f2fs_sb_info *, - block_t, struct bio *); -int f2fs_target_device_index(struct f2fs_sb_info *, block_t); -void set_data_blkaddr(struct dnode_of_data *); -void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); -int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); -int reserve_new_block(struct dnode_of_data *); -int f2fs_get_block(struct dnode_of_data *, pgoff_t); -int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); -int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); -struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t, bool); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct f2fs_io_info *); -int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); -int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void f2fs_set_page_dirty_nobuffers(struct page *); -void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); -int f2fs_release_page(struct page *, gfp_t); +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_submit_page_write(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *find_data_page(struct inode *inode, pgoff_t index); +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +void f2fs_set_page_dirty_nobuffers(struct page *page); 
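/*
 * Reviewer's aside, not part of the diff: f2fs_update_iostat(), added
 * earlier in this patch, never lets APP_BUFFERED_IO drift: it is recomputed
 * as total app writes minus direct writes whenever either of those counters
 * moves. A minimal model of that bookkeeping (the enum mirrors only the
 * three counters involved):
 *
 *	#include <stdio.h>
 *
 *	enum { APP_DIRECT_IO, APP_BUFFERED_IO, APP_WRITE_IO, NR_IO_TYPE };
 *	static unsigned long long write_iostat[NR_IO_TYPE];
 *
 *	static void update_iostat(int type, unsigned long long bytes)
 *	{
 *		write_iostat[type] += bytes;
 *		if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
 *			write_iostat[APP_BUFFERED_IO] =
 *				write_iostat[APP_WRITE_IO] -
 *				write_iostat[APP_DIRECT_IO];
 *	}
 *
 *	int main(void)
 *	{
 *		update_iostat(APP_WRITE_IO, 4096);	// buffered write
 *		update_iostat(APP_WRITE_IO, 8192);	// a direct write counts
 *		update_iostat(APP_DIRECT_IO, 8192);	// toward both totals
 *		printf("%llu\n", write_iostat[APP_BUFFERED_IO]);
 *		return 0;			// prints: 4096
 *	}
 */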
+int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *, struct page *, struct page *, - enum migrate_mode); +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool, bool); -void build_gc_manager(struct f2fs_sb_info *); +int start_gc_thread(struct f2fs_sb_info *sbi); +void stop_gc_thread(struct f2fs_sb_info *sbi); +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); +void build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *, bool); -bool space_for_roll_forward(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -2311,11 +2751,15 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int inline_xattr, inline_inode, inline_dir, orphans; - int aw_cnt, max_aw_cnt; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2388,9 +2832,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) #define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) #define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ @@ -2398,11 +2842,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ 
si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2412,14 +2867,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2427,40 +2882,43 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define 
stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -2483,52 +2941,78 @@ extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline_data(struct inode *); -bool f2fs_may_inline_dentry(struct inode *); -void read_inline_data(struct page *, struct page *); -bool truncate_inline_inode(struct page *, u64); -int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); -int f2fs_convert_inline_inode(struct inode *); -int f2fs_write_inline_data(struct inode *, struct page *); -bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct fscrypt_name *, struct page **); -int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, - struct inode *, struct inode *); -bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct fscrypt_str *); -int f2fs_inline_data_fiemap(struct inode *, - struct fiemap_extent_info *, __u64, __u64); +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void read_inline_data(struct page *page, struct page *ipage); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +bool recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* * shrinker.c */ -unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); -unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); -void f2fs_join_shrinker(struct f2fs_sb_info *); -void f2fs_leave_shrinker(struct f2fs_sb_info *); +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void 
f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); -void f2fs_drop_extent_tree(struct inode *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); -void f2fs_update_extent_cache(struct dnode_of_data *); +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t, block_t, unsigned int); -void init_extent_cache_info(struct f2fs_sb_info *); + pgoff_t fofs, block_t blkaddr, unsigned int len); +void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); +/* + * sysfs.c + */ +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); + /* * crypto support */ @@ -2537,6 +3021,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2559,6 +3048,21 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) @@ -2606,28 +3110,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -#ifndef CONFIG_F2FS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages 
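
The three f2fs_sb_has_*() helpers added above all reduce to the same pattern: test one bit of the little-endian feature word in the on-disk superblock, the same word that f2fs_ioc_get_features() later in this series hands to userspace. A minimal sketch of that pattern, as an illustration only (the real helpers go through the F2FS_HAS_FEATURE() macro, and sb_has_feature() is a made-up name):

	/* illustrative decomposition of an f2fs feature-bit check */
	static inline bool sb_has_feature(struct f2fs_sb_info *sbi, u32 mask)
	{
		/* ->feature is __le32 on disk; convert before masking */
		u32 features = le32_to_cpu(sbi->raw_super->feature);

		return (features & mask) != 0;
	}

With that, a helper such as f2fs_sb_has_inode_chksum() is just this check against F2FS_FEATURE_INODE_CHKSUM.
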
-#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk -#endif #endif diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h deleted file mode 100644 index f113f1a1e8c1..000000000000 --- a/fs/f2fs/f2fs_crypto.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * linux/fs/f2fs/f2fs_crypto.h - * - * Copied from linux/fs/ext4/ext4_crypto.h - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption header content for f2fs - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ -#ifndef _F2FS_CRYPTO_H -#define _F2FS_CRYPTO_H - -#include - -#define F2FS_KEY_DESCRIPTOR_SIZE 8 - -/* Policy provided via an ioctl on the topmost directory */ -struct f2fs_encryption_policy { - char version; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; -} __attribute__((__packed__)); - -#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -#define F2FS_KEY_DERIVATION_NONCE_SIZE 16 - -#define F2FS_POLICY_FLAGS_PAD_4 0x00 -#define F2FS_POLICY_FLAGS_PAD_8 0x01 -#define F2FS_POLICY_FLAGS_PAD_16 0x02 -#define F2FS_POLICY_FLAGS_PAD_32 0x03 -#define F2FS_POLICY_FLAGS_PAD_MASK 0x03 -#define F2FS_POLICY_FLAGS_VALID 0x03 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct f2fs_encryption_context { - char format; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; - char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE]; -} __attribute__((__packed__)); - -/* Encryption parameters */ -#define F2FS_XTS_TWEAK_SIZE 16 -#define F2FS_AES_128_ECB_KEY_SIZE 16 -#define F2FS_AES_256_GCM_KEY_SIZE 32 -#define F2FS_AES_256_CBC_KEY_SIZE 32 -#define F2FS_AES_256_CTS_KEY_SIZE 32 -#define F2FS_AES_256_XTS_KEY_SIZE 64 -#define F2FS_MAX_KEY_SIZE 64 - -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 - -struct f2fs_encryption_key { - __u32 mode; - char raw[F2FS_MAX_KEY_SIZE]; - __u32 size; -} __attribute__((__packed__)); - -struct f2fs_crypt_info { - char ci_data_mode; - char ci_filename_mode; - char ci_flags; - struct crypto_ablkcipher *ci_ctfm; - char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define 
F2FS_WRITE_PATH_FL 0x00000002 - -struct f2fs_crypto_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - char flags; /* Flags */ -}; - -struct f2fs_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \ - struct f2fs_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -static inline int f2fs_encryption_key_size(int mode) -{ - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - return F2FS_AES_256_XTS_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_GCM: - return F2FS_AES_256_GCM_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CBC: - return F2FS_AES_256_CBC_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - return F2FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - -#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define F2FS_CRYPTO_BLOCK_SIZE 16 -#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32 - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct f2fs_encrypted_symlink_data { - __le16 len; - char encrypted_path[1]; -} __attribute__((__packed__)); - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 encrypted_symlink_data_len(u32 l) -{ - return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1); -} -#endif /* _F2FS_CRYPTO_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e4e5d76d80b0..531379f513fa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,19 @@ #include "trace.h" #include +static int f2fs_filemap_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&F2FS_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return err; +} + static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -60,13 +74,14 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); file_update_time(vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; - goto out; + goto out_sem; } /* @@ -86,15 +101,19 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -102,7 +121,7 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, }; @@ -117,11 +136,6 @@ 
static int get_parent_ino(struct inode *inode, nid_t *pino)
 	if (!dentry)
 		return 0;

-	if (update_dent_inode(inode, inode, &dentry->d_name)) {
-		dput(dentry);
-		return 0;
-	}
-
 	*pino = parent_ino(dentry);
 	dput(dentry);
 	return 1;
@@ -142,8 +156,6 @@ static inline bool need_do_checkpoint(struct inode *inode)
 		need_cp = true;
 	else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
 		need_cp = true;
-	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
-		need_cp = true;
 	else if (test_opt(sbi, FASTBOOT))
 		need_cp = true;
 	else if (sbi->active_logs == 2)
@@ -169,7 +181,6 @@ static void try_to_fix_pino(struct inode *inode)
 	nid_t pino;

 	down_write(&fi->i_sem);
-	fi->xattr_ver = 0;
 	if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
 			get_parent_ino(inode, &pino)) {
 		f2fs_i_pino_write(inode, pino);
@@ -268,9 +279,19 @@ sync_nodes:
 		goto sync_nodes;
 	}

-	ret = wait_on_node_pages_writeback(sbi, ino);
-	if (ret)
-		goto out;
+	/*
+	 * If it's an atomic write, it is fine to keep the write ordering; we
+	 * don't need to wait for node write completion here, since the node
+	 * chain we use already serializes the node blocks. If one of the node
+	 * writes is reordered, we simply see a broken chain and roll-forward
+	 * recovery stops there, which means we recover either all or none of
+	 * the node blocks covered by the fsync mark.
+	 */
+	if (!atomic) {
+		ret = wait_on_node_pages_writeback(sbi, ino);
+		if (ret)
+			goto out;
+	}

 	/* once recovery info is written, don't need to track this */
 	remove_ino_entry(sbi, ino, APPEND_INO);
@@ -278,7 +299,8 @@ sync_nodes:
flush_out:
 	remove_ino_entry(sbi, ino, UPDATE_INO);
 	clear_inode_flag(inode, FI_UPDATE_WRITE);
-	ret = f2fs_issue_flush(sbi);
+	if (!atomic)
+		ret = f2fs_issue_flush(sbi);
 	f2fs_update_time(sbi, REQ_TIME);
out:
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -375,7 +397,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 				dn.ofs_in_node++, pgofs++,
 				data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
 			block_t blkaddr;
-			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+			blkaddr = datablock_addr(dn.inode,
+					dn.node_page, dn.ofs_in_node);

 			if (__found_offset(blkaddr, dirty, pgofs, whence)) {
 				f2fs_put_dnode(&dn);
@@ -423,14 +446,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	struct inode *inode = file_inode(file);
 	int err;

-	if (f2fs_encrypted_inode(inode)) {
-		err = fscrypt_get_encryption_info(inode);
-		if (err)
-			return 0;
-		if (!f2fs_encrypted_inode(inode))
-			return -ENOKEY;
-	}
-
 	/* we don't need to use inline_data strictly */
 	err = f2fs_convert_inline_inode(inode);
 	if (err)
@@ -443,11 +458,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)

 static int f2fs_file_open(struct inode *inode, struct file *filp)
 {
-	int ret = generic_file_open(inode, filp);
 	struct dentry *dir;

-	if (!ret && f2fs_encrypted_inode(inode)) {
-		ret = fscrypt_get_encryption_info(inode);
+	if (f2fs_encrypted_inode(inode)) {
+		int ret = fscrypt_get_encryption_info(inode);
 		if (ret)
 			return -EACCES;
 		if (!fscrypt_has_encryption_key(inode))
@@ -460,7 +474,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
 			return -EPERM;
 	}
 	dput(dir);
-	return ret;
+	return dquot_file_open(inode, filp);
 }

 int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
@@ -469,9 +483,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 	struct f2fs_node *raw_node;
 	int nr_free = 0, ofs = dn->ofs_in_node, len = count;
 	__le32 *addr;
+	int base = 0;
+
+	if (IS_INODE(dn->node_page) &&
f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -531,12 +549,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || - !S_ISREG(inode->i_mode)) + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -569,10 +589,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - if (truncate_inline_inode(ipage, from)) - set_page_dirty(ipage); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; @@ -621,6 +638,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -642,7 +665,6 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); - stat->blocks <<= 3; return 0; } @@ -686,14 +708,34 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } + if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } if (attr->ia_size <= i_size_read(inode)) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); err = f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); if (err) return err; } else { @@ -701,7 +743,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. 
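	 * (The down_write() of F2FS_I(inode)->i_mmap_sem added around
	 * truncate_setsize() here pairs with the down_read() taken in
	 * f2fs_filemap_fault() earlier in this patch: while the truncation
	 * is in flight, a concurrent page fault cannot map pages beyond
	 * the size being set.)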
*/ + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); + up_write(&F2FS_I(inode)->i_mmap_sem); /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { @@ -847,12 +891,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -883,7 +929,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -959,15 +1006,15 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { f2fs_i_blocks_write(src_inode, - 1, false); + 1, false, false); f2fs_i_blocks_write(dst_inode, - 1, true); + 1, true, false); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, blkaddr[i], ni.version, true, false); @@ -1019,11 +1066,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1091,16 +1138,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - return ret; + goto out; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1113,6 +1161,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1126,7 +1176,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1137,8 +1188,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not 
guarantee entire block
 		 * allocation.
@@ -1177,9 +1228,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 	if (ret)
 		return ret;

+	down_write(&F2FS_I(inode)->i_mmap_sem);
 	ret = filemap_write_and_wait_range(mapping, offset,
 						offset + len - 1);
 	if (ret)
-		return ret;
+		goto out_sem;

 	truncate_pagecache_range(inode, offset, offset + len - 1);
@@ -1193,17 +1245,15 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 		ret = fill_zero(inode, pg_start, off_start,
 						off_end - off_start);
 		if (ret)
-			return ret;
+			goto out_sem;

-		if (offset + len > new_size)
-			new_size = offset + len;
 		new_size = max_t(loff_t, new_size, offset + len);
 	} else {
 		if (off_start) {
 			ret = fill_zero(inode, pg_start++, off_start,
 						PAGE_SIZE - off_start);
 			if (ret)
-				return ret;
+				goto out_sem;

 			new_size = max_t(loff_t, new_size,
 					(loff_t)pg_start << PAGE_SHIFT);
@@ -1252,6 +1302,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
out:
 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 			i_size_read(inode) < new_size)
 		f2fs_i_size_write(inode, new_size);
+out_sem:
+	up_write(&F2FS_I(inode)->i_mmap_sem);
 	return ret;
 }
@@ -1264,8 +1316,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	int ret = 0;

 	new_size = i_size_read(inode) + len;
-	if (new_size > inode->i_sb->s_maxbytes)
-		return -EFBIG;
+	ret = inode_newsize_ok(inode, new_size);
+	if (ret)
+		return ret;

 	if (offset >= i_size_read(inode))
 		return -EINVAL;
@@ -1280,14 +1333,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	f2fs_balance_fs(sbi, true);

+	down_write(&F2FS_I(inode)->i_mmap_sem);
 	ret = truncate_blocks(inode, i_size_read(inode), true);
 	if (ret)
-		return ret;
+		goto out;

 	/* write out all dirty pages from offset */
 	ret = filemap_write_and_wait_range(inode->i_mapping,
 						offset, LLONG_MAX);
 	if (ret)
-		return ret;
+		goto out;

 	truncate_pagecache(inode, offset);
@@ -1316,6 +1370,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	if (!ret)
 		f2fs_i_size_write(inode, new_size);

+out:
+	up_write(&F2FS_I(inode)->i_mmap_sem);
 	return ret;
 }
@@ -1435,6 +1491,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
 		drop_inmem_pages(inode);
 	if (f2fs_is_volatile_file(inode)) {
 		clear_inode_flag(inode, FI_VOLATILE_FILE);
+		stat_dec_volatile_write(inode);
 		set_inode_flag(inode, FI_DROP_CACHE);
 		filemap_fdatawrite(inode->i_mapping);
 		clear_inode_flag(inode, FI_DROP_CACHE);
@@ -1442,17 +1499,20 @@
 	return 0;
 }

-#define F2FS_REG_FLMASK		(~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
-#define F2FS_OTHER_FLMASK	(FS_NODUMP_FL | FS_NOATIME_FL)
-
-static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+static int f2fs_file_flush(struct file *file, fl_owner_t id)
 {
-	if (S_ISDIR(mode))
-		return flags;
-	else if (S_ISREG(mode))
-		return flags & F2FS_REG_FLMASK;
-	else
-		return flags & F2FS_OTHER_FLMASK;
+	struct inode *inode = file_inode(file);
+
+	/*
+	 * If a process crashes in the middle of a transaction, we should
+	 * roll back; otherwise other readers/writers could see a corrupted
+	 * database until every writer has closed the file. Since this must
+	 * happen before the file lock is dropped, it has to be done in ->flush.
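+	 * Only the task that opened the transaction performs the roll-back
+	 * below (note the F2FS_I(inode)->inmem_task == current check): any
+	 * other process sharing the struct file and reaching ->flush must
+	 * not drop the writer's in-memory pages.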
+ */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; } static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) @@ -1481,28 +1541,34 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + ret = -EPERM; + goto unlock_out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - inode_unlock(inode); ret = -EPERM; - goto out; + goto unlock_out; } } flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); -out: + f2fs_mark_inode_dirty_sync(inode, false); +unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1522,6 +1588,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1536,20 +1605,27 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + clear_inode_flag(inode, FI_HOT_DATA); + goto out; + } + +inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1580,10 +1656,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); @@ -1599,6 +1676,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1612,6 +1692,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1667,6 +1750,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1712,7 +1796,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); 
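/*
 * Illustration only (not part of the patch): the userspace side of the
 * atomic-write ioctls handled above, in the SQLite-style sequence they
 * are written for.  The ioctl numbers come from the f2fs uapi header.
 *
 *	int fd = open("file.db", O_RDWR);
 *
 *	ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE);
 *	write(fd, buf, len);	   (data is staged as in-memory pages)
 *	ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);   (all-or-nothing, durable)
 *
 * If the writer dies between start and commit, f2fs_file_flush() above
 * drops the staged pages as the dying task's files are torn down,
 * rolling the transaction back.
 */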
f2fs_stop_checkpoint(sbi, false); break; default: @@ -1773,31 +1857,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1863,7 +1932,51 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) + return -EINVAL; +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + range.start += sbi->blocks_per_seg; + if (range.start <= end) + goto do_more; out: mnt_drop_write_file(filp); return ret; @@ -1897,17 +2010,16 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; - struct extent_info ei; + struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -1941,7 +2053,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -1965,7 +2077,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -1983,7 +2095,7 @@ static int 
f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -2042,42 +2154,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) return -EINVAL; err = mnt_want_write_file(filp); if (err) return err; - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } - - if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } - - /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } - err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, @@ -2211,6 +2321,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) @@ -2220,6 +2332,79 @@ err_out: return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret 
< 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2251,12 +2436,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); default: return -ENOTTY; } @@ -2269,16 +2460,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; - if (f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - int err = f2fs_preallocate_blocks(iocb, from); + int err; + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + err = f2fs_preallocate_blocks(iocb, from); if (err) { inode_unlock(inode); return err; @@ -2286,6 +2476,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); + clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); @@ -2322,10 +2516,12 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: break; default: return -ENOIOCTLCMD; @@ -2341,6 +2537,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7f0c3e02408c..bd16e6631cf3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,17 +28,23 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; + set_freezable(); do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, + msecs_to_jiffies(wait_ms)); + + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; - else - wait_event_interruptible_timeout(*wq, - 
kthread_should_stop(), - msecs_to_jiffies(wait_ms)); if (kthread_should_stop()) break; @@ -48,10 +54,15 @@ static int gc_thread_func(void *data) } #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -66,23 +77,28 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. */ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; + + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -90,6 +106,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; @@ -107,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -166,10 +187,15 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -179,7 +205,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -199,8 +225,12 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -208,8 +238,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * 
sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -218,7 +248,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -237,6 +267,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } +static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int valid_blocks = + get_valid_blocks(sbi, segno, true); + + return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + valid_blocks * 2 : valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -245,7 +285,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + return get_greedy_cost(sbi, segno); else return get_cb_cost(sbi, segno); } @@ -274,6 +314,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -287,10 +328,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -303,9 +352,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -322,13 +372,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -338,17 +390,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - 
sbi->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -531,12 +584,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) @@ -544,15 +599,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_READ, .op_flags = REQ_SYNC, .encrypted_page = NULL, + .in_list = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -596,7 +657,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA); + &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -634,7 +695,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -676,10 +739,14 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; @@ -768,8 +835,7 @@ next_step: continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -803,14 +869,18 @@ next_step: continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx, segno, off); + if (f2fs_encrypted_file(inode)) + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ 
-847,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -871,7 +941,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -886,7 +956,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -894,90 +963,113 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } if (gc_type == FG_GC) - f2fs_submit_merged_bio(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret = -EINVAL; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: - segno = NULL_SEGNO; - - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + ret = -EINVAL; goto stop; + } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { - gc_type = FG_GC; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { /* - * If there is no victim and no prefree segment but still not - * enough free sections, we should flush dent/node blocks and do - * garbage collections. + * For example, if there are many prefree_segments below the given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which don't need fggc any more. */ - if (__get_victim(sbi, &segno, gc_type) || - prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - segno = NULL_SEGNO; - } else if (has_not_enough_free_secs(sbi, 0, 0)) { + if (prefree_segments(sbi)) { ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; } - } else if (gc_type == BG_GC && !background) { - /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; + } + + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ + if (gc_type == BG_GC && !background) { + ret = -EINVAL; + goto stop; + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; } - if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) - goto stop; - ret = 0; - - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); @@ -989,5 +1081,20 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, ovp_count; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { @@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait 
= gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index fb5d7d1f34aa..fbf22b0f667f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -23,10 +23,10 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; @@ -45,6 +45,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -52,31 +53,33 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) - return false; + if (from >= MAX_INLINE_DATA(inode)) + return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -132,6 +135,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; @@ -153,6 +157,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { @@ -164,11 +169,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) 
set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -215,6 +220,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -231,11 +238,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -270,9 +282,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,9 +297,8 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -301,11 +312,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -314,11 +325,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -332,19 +343,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *dentry_blk; struct f2fs_dentry_ptr d; + void *inline_dentry; - dentry_blk = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + 
make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -353,11 +364,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -372,25 +384,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because they haven't been used so far. 
*/ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -398,7 +409,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -411,14 +422,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -460,20 +470,20 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -489,9 +499,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -500,7 +510,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -516,7 +526,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *dentry_blk = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -526,11 +536,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - dentry_blk = inline_data_addr(ipage); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { - err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; @@ -545,14 +556,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr 
*new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + name_hash = f2fs_dentry_hash(new_name, NULL); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -575,7 +583,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -583,11 +592,12 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 1); @@ -604,20 +614,21 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *dentry_blk; + void *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - dentry_blk = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -627,25 +638,27 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? 
err : 0; @@ -670,7 +683,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -679,7 +692,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index af06bda51a54..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include <trace/events/f2fs.h> @@ -44,25 +45,26 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { @@ -104,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct 
f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -130,7 +206,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -153,6 +229,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
+ le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -166,6 +245,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -226,6 +315,7 @@ make_now: ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -267,7 +357,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); if (et) { read_lock(&et->lock); @@ -291,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -316,7 +420,6 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -339,7 +442,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent producing dirty node pages * during the urgent cleaning time when running out of free sections. 
*/ - if (update_inode_page(inode) && wbc && wbc->nr_to_write) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -372,10 +476,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) - goto no_delete; -#endif + dquot_initialize(inode); remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -387,6 +488,12 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); @@ -403,13 +510,22 @@ retry: if (err) update_inode_page(inode); + dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + + /* ino == 0, if f2fs_new_inode() failed */ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { @@ -421,9 +537,10 @@ no_delete: if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); + } else { + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); @@ -446,6 +563,7 @@ void handle_failed_inode(struct inode *inode) * in a panic when flushing dirty inodes in gdirty_list. */ update_inode_page(inode); + f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index db3079cd665d..d92b8e9064cb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/dcache.h> #include <linux/namei.h> +#include <linux/quotaops.h> #include "f2fs.h" #include "node.h" @@ -42,6 +43,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } f2fs_unlock_op(sbi); + nid_free = true; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; @@ -52,16 +55,35 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + /* If the directory is encrypted, then we should encrypt the inode. 
*/ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -75,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -85,6 +116,16 @@ fail: set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int is_multimedia_file(const unsigned char *s, const char *sub) @@ -136,6 +177,10 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -148,8 +193,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +206,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -180,6 +225,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); inode->i_ctime = current_time(inode); @@ -233,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -324,9 +382,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? 
-ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); @@ -346,6 +405,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + err = dquot_initialize(dir); + if (err) + return err; + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { if (IS_ERR(page)) @@ -400,7 +463,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -409,6 +472,10 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -420,8 +487,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -444,7 +509,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } @@ -484,6 +549,8 @@ err_out: } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -496,6 +563,10 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -505,8 +576,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -521,6 +590,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -544,6 +615,10 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -551,8 +626,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -566,6 +639,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -579,6 +654,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -592,8 +671,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - 
f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -619,6 +696,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: @@ -672,6 +751,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -717,13 +809,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -775,9 +860,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + F2FS_I(old_inode)->i_pino = new_dir->i_ino; up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -858,6 +944,22 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -905,8 +1007,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } @@ -914,18 +1016,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." 
directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -941,7 +1031,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = CURRENT_TIME; + old_dir->i_ctime = current_time(old_dir); if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -956,7 +1046,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = CURRENT_TIME; + new_dir->i_ctime = current_time(new_dir); if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); @@ -969,14 +1059,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc67dc323f7e..32474db18ad9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,10 +19,11 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include <trace/events/f2fs.h> -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -63,8 +64,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * @@ -157,9 +159,6 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; - if (get_nat_flag(ne, IS_DIRTY)) - return; - head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); @@ -170,25 +169,27 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } - list_move_tail(&ne->list, &head->entry_list); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + nm_i->dirty_nat_cnt++; head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + if (nat_get_blkaddr(ne) == NEW_ADDR) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + 
nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -381,6 +382,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -406,17 +408,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); @@ -549,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -573,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -608,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -649,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -673,15 +682,11 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); - goto invalidate; - } f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { @@ -689,7 +694,7 @@ static void truncate_node(struct dnode_of_data *dn) dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } -invalidate: + clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -875,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -971,9 +978,6 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - /* need to do checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - set_new_dnode(&dn, inode, page, npage, nid); if (page) @@ -1009,7 +1013,7 @@ int remove_inode_page(struct inode *inode) /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && 
inode->i_blocks != 1); + inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ truncate_node(&dn); @@ -1024,14 +1028,13 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct node_info old_ni, new_ni; + struct node_info new_ni; struct page *page; int err; @@ -1042,17 +1045,18 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { - err = -ENOSPC; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; - } - get_node_info(sbi, dn->nid, &old_ni); - - /* Reinitialize old_ni with new node page */ - f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; +#ifdef CONFIG_F2FS_CHECK_FS + get_node_info(sbi, dn->nid, &new_ni); + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_wait_on_page_writeback(page, NODE, true); @@ -1153,6 +1157,7 @@ repeat: f2fs_put_page(page, 1); return ERR_PTR(err); } else if (err == LOCKED_PAGE) { + err = 0; goto page_hit; } @@ -1166,15 +1171,27 @@ repeat: goto repeat; } - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + err = -EIO; goto out_err; + } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { - f2fs_bug_on(sbi, 1); - ClearPageUptodate(page); + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + return ERR_PTR(err); } return page; } @@ -1318,16 +1335,103 @@ continue_unlock: return last_page; } +static int __write_node_page(struct page *page, bool atomic, bool *submitted, + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + .submitted = false, + .io_type = io_type, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + if (atomic && !test_opt(sbi, NOBARRIER)) 
+ fio.op_flags |= WRITE_FLUSH_FUA; + + set_page_writeback(page); + fio.old_blkaddr = ni.blk_addr; + write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); + submitted = NULL; + } + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, NODE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + if (do_balance) + f2fs_balance_fs(sbi, false); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); +} + int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; + pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1349,6 +1453,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); @@ -1380,6 +1485,9 @@ continue_unlock: f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { @@ -1397,13 +1505,16 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + ret = __write_node_page(page, atomic && + page == last_page, + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } if (page == last_page) { @@ -1429,12 +1540,13 @@ continue_unlock: goto retry; } out: - if (nwritten) - f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1458,6 +1570,7 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); @@ -1511,9 +1624,11 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type); + if (ret) unlock_page(page); - else + else if (submitted) nwritten++; if (--wbc->nr_to_write == 0) @@ -1534,7 +1649,7 @@ continue_unlock: } out: if (nwritten) - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); return ret; } @@ -1580,72 +1695,6 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - nid_t nid; - struct node_info ni; - struct f2fs_io_info fio = { - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, - .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; - - trace_f2fs_writepage(page, NODE); - - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; - - /* get old block addr of this node page */ - nid = nid_of_node(page); - f2fs_bug_on(sbi, page->index != nid); - - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - - get_node_info(sbi, nid, &ni); - - /* This page is already truncated */ - if (unlikely(ni.blk_addr == NULL_ADDR)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - unlock_page(page); - return 0; - } - - set_page_writeback(page); - fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - - if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); - - unlock_page(page); - - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, NODE, WRITE); - - return 0; - -redirty_out: - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1653,6 +1702,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct blk_plug plug; long diff; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1665,7 +1717,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1743,43 +1795,71 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +/* return if the nid is recognized as free */ +static 
bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) - return 0; - - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return 0; - } + return false; i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return 0; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return 0; - } - return 1; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1800,17 +1880,45 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + else + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + + if (set) + nm_i->free_nid_count[nat_ofs]++; + else if (!build) + nm_i->free_nid_count[nat_ofs]--; +} + static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + bool freed = false; if (unlikely(start_nid >= nm_i->max_nid)) break; @@ -1818,11 +1926,58 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) - add_free_nid(sbi, start_nid, true); + freed = add_free_nid(sbi, start_nid, true); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static 
void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + unsigned int i, idx; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + if (!nm_i->free_nid_count[i]) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + nid_t nid; + + if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) + continue; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true); + + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + goto out; + } + } +out: + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); +} + +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1830,6 +1985,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; @@ -1837,6 +1995,14 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID_LIST]) + return; + } + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1879,10 +2045,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync); + __build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1897,8 +2063,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); return false; + } #endif spin_lock(&nm_i->nid_list_lock); @@ -1918,13 +2086,16 @@ retry: i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false, false); + spin_unlock(&nm_i->nid_list_lock); return true; } spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true); + build_free_nids(sbi, true, false); goto retry; } @@ -1972,6 +2143,8 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2034,38 +2207,47 @@ update_inode: f2fs_put_page(ipage, 1); } -void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { 
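/*
 * Editor's note, not part of the patch: as rewritten below, xattr
 * recovery no longer reuses the nid recorded in the fsynced node page.
 * It (1) invalidates the inode's previous xattr node, if any, (2)
 * allocates a fresh nid and node page via alloc_nid()/new_node_page(),
 * failing with -ENOSPC or the page error instead of hitting a bug_on,
 * and (3) copies VALID_XATTR_BLOCK_SIZE bytes from the logged page and
 * dirties the new page so it is written out through the normal path.
 */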
struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; + struct page *xpage; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; - /* Deallocate node address */ + /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, inode); + dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: - /* 2: allocate new xattr nid */ - if (unlikely(!inc_valid_node_count(sbi, inode))) - f2fs_bug_on(sbi, 1); + /* 2: update xattr nid in inode */ + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; - remove_free_nid(sbi, new_xnid); - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); - f2fs_i_xnid_write(inode, new_xnid); + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } - /* 3: update xattr blkaddr */ - refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr, false); + alloc_nid_done(sbi, new_xnid); + update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + return 0; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2101,12 +2283,19 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL))) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); @@ -2208,8 +2397,39 @@ add_out: list_add_tail(&nes->set_list, head); } +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { + if (start_nid == 0 && i == 0) + valid++; + if (nat_blk->entries[i].block_addr) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, - struct nat_entry_set *set) + struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; @@ -2224,7 +2444,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush 
nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -2241,8 +2462,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nid_t nid = nat_get_nid(ne); int offset; - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { offset = lookup_journal_in_cursum(journal, @@ -2255,30 +2475,38 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } - if (to_journal) + if (to_journal) { up_write(&curseg->journal_rwsem); - else + } else { + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); + } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2299,7 +2527,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -2313,11 +2542,86 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set); + __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); + /* Allow dirty nats by node block allocation in write_begin */ +} - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + + F2FS_BLKSIZE - 1); + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, + GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page = get_meta_page(sbi, nat_bits_addr++); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + + spin_lock(&NM_I(sbi)->nid_list_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -2325,15 +2629,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi) struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. 
*/ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - @@ -2367,6 +2671,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + err = __get_nat_bitmaps(sbi); + if (err) + return err; + #ifdef CONFIG_F2FS_CHECK_FS nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); @@ -2377,6 +2685,27 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } +static int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * + NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * + sizeof(unsigned short), GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + return 0; +} + int build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2389,7 +2718,14 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi, true); + err = init_free_nid_cache(sbi); + if (err) + return err; + + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + + build_free_nids(sbi, true, true); return 0; } @@ -2447,7 +2783,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) } up_write(&nm_i->nat_tree_lock); + kvfree(nm_i->nat_block_bitmap); + kvfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_count); + kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS kfree(nm_i->nat_bitmap_mir); #endif diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 29ff783eb9c3..bb53e9955ff2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. 
*/ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) @@ -200,21 +200,18 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); -#ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != - f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) - f2fs_bug_on(sbi, 1); -#endif - if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -227,11 +224,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } @@ -306,14 +299,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } @@ 
-321,14 +311,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); __u64 cp_ver = cur_cp_version(ckpt); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + return cp_ver == cpver_of_node(page); } @@ -358,7 +345,7 @@ static inline bool IS_DNODE(struct page *node_page) unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) - return false; + return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4a3c48c24c10..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ retry: err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -198,7 +220,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,17 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + bool quota_inode = false; + + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. 
*/ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -326,10 +354,18 @@ got_it: f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -359,7 +395,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -378,11 +415,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - /* - * Deprecated; xattr blocks should be found from cold log. - * But, we should remain this for backward compatibility. - */ - recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page, blkaddr); + if (!err) + recovered++; goto out; } @@ -414,8 +449,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) @@ -428,8 +463,9 @@ retry_dn: } if (!file_keep_isize(inode) && - (i_size_read(inode) <= (start << PAGE_SHIFT))) - f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -556,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -570,13 +621,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -585,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ 
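Editor's aside, not part of the patch: the recover_fsync_data() changes
around this point temporarily clear MS_RDONLY (and, under CONFIG_QUOTA,
set MS_ACTIVE and enable quota files) so that recovery and iput() can
write, then restore the saved s_flags at the final out: label. A minimal
userspace model of that save/modify/restore pattern follows; the flag
name and helpers are illustrative, not kernel API:

	#include <stdio.h>

	#define MODEL_RDONLY 0x1UL		/* stands in for MS_RDONLY */

	static void recovery_work(void)
	{
		/* writes that need a writable fs would happen here */
	}

	static void run_recovery(unsigned long *s_flags)
	{
		unsigned long saved = *s_flags;	/* remember mount state */

		*s_flags &= ~MODEL_RDONLY;	/* allow writes while recovering */
		recovery_work();
		*s_flags = saved;		/* restore MS_RDONLY status */
	}

	int main(void)
	{
		unsigned long flags = MODEL_RDONLY;

		run_recovery(&flags);
		printf("flags after recovery: %#lx\n", flags);
		return 0;
	}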
@@ -598,8 +649,6 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ @@ -613,5 +662,12 @@ out: } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 289b3facd2d8..059a219b7740 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,10 +16,13 @@ #include #include #include +#include +#include #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include @@ -166,6 +169,21 @@ found: return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -212,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } @@ -247,9 +271,40 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { @@ -260,10 +315,10 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, - .encrypted_page = NULL, + .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; - bool submit_bio = false; + pgoff_t last_idx = ULONG_MAX; int err = 0; list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { @@ -279,25 +334,31 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = LOCK_DONE; err = 
do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - - submit_bio = true; + last_idx = page->index; } unlock_page(page); list_move_tail(&cur->list, revoke_list); } - if (submit_bio) - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -352,15 +413,14 @@ int commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -369,7 +429,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -386,9 +446,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false); + build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -409,7 +469,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; @@ -418,20 +479,24 @@ static int __submit_flush_wait(struct block_device *bdev) bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } @@ -445,6 +510,8 @@ repeat: if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -453,6 +520,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -461,6 +530,8 @@ repeat: fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -470,36 +541,60 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; - - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); + int ret; 
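/*
 * Editor's sketch, not part of the patch: in the rewrite below, a
 * caller that finds no flush in flight submits the flush directly;
 * concurrent callers queue on a lock-free llist and sleep on a
 * completion that the dispatching side fulfils with the shared result.
 * A detached C11 model of the "first in-flight caller issues" decision
 * follows; the names are illustrative, not kernel API.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int inflight;	/* models fcc->issing_flush */

/*
 * True when the caller is the only flusher and should submit itself,
 * mirroring atomic_inc_return(&fcc->issing_flush) == 1: fetch_add
 * returns the old value, so 0 means we went first.
 */
static bool should_issue_directly(void)
{
	return atomic_fetch_add(&inflight, 1) == 0;
}

static void flush_done(void)
{
	atomic_fetch_sub(&inflight, 1);
}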
if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - int ret; - - atomic_inc(&fcc->submit_flush); + if (!test_opt(sbi, FLUSH_MERGE)) { ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_inc(&fcc->issued_flush); + return ret; + } + + if (atomic_inc_return(&fcc->issing_flush) == 1) { + ret = submit_flush_wait(sbi); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; @@ -513,16 +608,22 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; goto init_thread; } fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); @@ -592,8 +693,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -613,7 +714,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -628,61 +729,91 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio 
= bio; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; + dc->ref = 0; + dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); - list_add_tail(&dc->list, wait_list); + list_add_tail(&dc->list, pend_list); + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; return dc; } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); - struct discard_cmd *dc, *tmp; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; - list_for_each_entry_safe(dc, tmp, wait_list, list) { - struct bio *bio = dc->bio; - int err; + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); - if (!completion_done(&dc->wait)) { - if ((dc->lstart <= blkaddr && - blkaddr < dc->lstart + dc->len) || - blkaddr == NULL_ADDR) - wait_for_completion_io(&dc->wait); - else - continue; - } + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); - err = bio->bi_error; - if (err == -EOPNOTSUPP) - err = 0; + return dc; +} - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_dec(&dcc->issing_discard); - bio_put(bio); - list_del(&dc->list); - kmem_cache_free(discard_cmd_slab, dc); - } + list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + f2fs_bug_on(sbi, dc->ref); + + if (dc->error == -EOPNOTSUPP) + dc->error = 0; + + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); + __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - complete(&dc->wait); + dc->error = bio->bi_error; + dc->state = D_DONE; + complete_all(&dc->wait); + bio_put(bio); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -739,12 +870,12 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, } if (bio) { - int ret = submit_bio_wait(0, bio); + int ret = submit_bio_wait(op, bio); bio_put(bio); if (ret) return ret; } - bio = f2fs_bio_alloc(0); + bio = f2fs_bio_alloc(1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); @@ -766,58 +897,471 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; + map = 
(unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk = START_BLOCK(sbi, segno + 1); + } +#endif +} + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if (!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(REQ_SYNC, bio); + list_move_tail(&dc->list, &dcc->wait_list); + __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); + } + } else { + __remove_discard_cmd(sbi, dc); + } +} + +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) 
+ prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + mutex_unlock(&dcc->cmd_lock); +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - struct bio *bio = NULL; block_t lblkstart = blkstart; - int err; - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + trace_f2fs_queue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - struct discard_cmd *dc = __add_discard_cmd(sbi, bio, - lblkstart, blklen); + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + return 0; +} - bio->bi_private = dc; - bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int iter = 0, issued = 0; + int i; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { + __submit_discard_cmd(sbi, dc); + issued++; + + if (fatal_signal_pending(current)) + break; + continue; + } + + if (!issue_cond) { + __submit_discard_cmd(sbi, dc); + issued++; + continue; + } + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) + goto out; + } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } - return err; +out: + 
blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; + bool need_wait; + +next: + need_wait = false; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + __wait_one_discard_bio(sbi, dc); + goto next; + } +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + +/* This comes from f2fs_put_super and f2fs_trim_fs */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +{ + __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); + __wait_discard_cmd(sbi, false); +} + +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; + + set_freezable(); + + do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + return 0; + + if (dcc->discard_wake) { + dcc->discard_wake = 0; + if (sbi->gc_thread && 
sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } + + sb_end_intwrite(sbi->sb); + + } while (!kthread_should_stop()); + return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - sector_t sector; + sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - sector = SECTOR_FROM_BLOCK(blkstart); - - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { - f2fs_msg(sbi->sb, KERN_INFO, - "(%d) %s: Unaligned discard attempted (block %x + %x)", - devi, sbi->s_ndevs ? FDEV(devi).path: "", - blkstart, blklen); - return -EIO; - } /* * We need to know the type of the zone: for conventional zones, @@ -829,10 +1373,21 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? 
FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: @@ -850,7 +1405,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -893,31 +1448,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->nr_discards += end - start; -} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -929,7 +1459,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -937,7 +1469,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) return false; } @@ -946,7 +1479,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, dmap[i] = force ? 
~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; @@ -959,14 +1493,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,16 +1536,14 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); - - blk_start_plug(&plug); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -1031,32 +1573,117 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (f2fs_sb_mounted_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->nr_discards -= entry->len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - blk_finish_plug(&plug); + wake_up_discard_thread(sbi, false); +} + +static int 
create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc; + int err = 0, i; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) { + INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } + INIT_LIST_HEAD(&dcc->wait_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); + dcc->nr_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->root = RB_ROOT; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (!dcc) + return; + + stop_discard_thread(sbi); + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -1085,6 +1712,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1101,32 +1732,54 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); -#endif } +#endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; + } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; - } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { -#ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else - f2fs_bug_on(sbi, 1); -#endif + + /* don't overwrite by SSR to keep node chain */ + if (se->type == CURSEG_WARM_NODE) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; } + } else { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, 
exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; + } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1312,8 +1965,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1323,8 +1976,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1355,8 +2008,8 @@ find_other_zone: secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1400,7 +2053,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; @@ -1413,6 +2066,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. 
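/*
 * Illustrative sketch, not from the patch itself: the update_sit_entry()
 * hunks above replace the old nested bug checks with a pattern that applies
 * the same test-and-set (or test-and-clear) to both the live validity bitmap
 * and, under CONFIG_F2FS_CHECK_FS, a mirror copy, then treats any divergence
 * between the two results as corruption. A minimal user-space sketch of that
 * pattern follows; the byte-per-bit maps and all helper names here are
 * hypothetical stand-ins for f2fs_test_and_set_bit() and friends.
 */
#include <stdbool.h>
#include <stdio.h>

/* stand-in for f2fs_test_and_set_bit(): returns the old value, sets the bit */
static bool test_and_set(bool *map, unsigned int off)
{
	bool old = map[off];

	map[off] = true;
	return old;
}

static void mark_block_valid(bool *cur_map, bool *mir_map, unsigned int off)
{
	bool exist = test_and_set(cur_map, off);
	bool mir_exist = test_and_set(mir_map, off);

	/* live and mirror maps must agree; divergence means memory corruption */
	if (exist != mir_exist)
		fprintf(stderr, "inconsistent bitmaps at blk %u\n", off);

	/* setting an already-set bit means the block was allocated twice */
	if (exist)
		fprintf(stderr, "bitmap was wrongly set at blk %u\n", off);
}

int main(void)
{
	bool cur[8] = { false }, mir[8] = { false };

	mark_block_valid(cur, mir, 3);	/* first set: silent */
	mark_block_valid(cur, mir, 3);	/* double set: reports the error */
	return 0;
}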
@@ -1431,6 +2098,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); @@ -1473,7 +2141,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1494,28 +2162,53 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; + int i, cnt; + bool reversed = false; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + /* need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; + return 1; + } - /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } + + for (; cnt-- > 0; reversed ? 
i-- : i++) { + if (i == type) + continue; + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } + } return 0; } @@ -1530,12 +2223,13 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); - else if (type == CURSEG_WARM_NODE) + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -1563,14 +2257,19 @@ static const struct segment_allocation default_salloc_ops = { bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; + bool has_candidate = false; mutex_lock(&SIT_I(sbi)->sentry_lock); - for (; trim_start <= cpc->trim_end; trim_start++) - if (add_discard_addrs(sbi, cpc, true)) + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; break; + } + } mutex_unlock(&SIT_I(sbi)->sentry_lock); - return trim_start <= cpc->trim_end; + cpc->trim_start = trim_start; + return has_candidate; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) @@ -1623,6 +2322,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -1636,68 +2338,80 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(page) && is_cold_node(page)) + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_segment_type_6(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? 
CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +static int __get_segment_type(struct f2fs_io_info *fio) { - switch (F2FS_P_SB(page)->active_logs) { + int type = 0; + + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(page, p_type); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(page, p_type); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(F2FS_P_SB(page), - F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type) + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1723,42 +2437,52 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + mutex_unlock(&curseg->curseg_mutex); } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio->page, fio->type); + int type = __get_segment_type(fio); int err; - if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type); + &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - err = f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_write(fio); if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; } - - if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -1769,13 +2493,16 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, + .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void 
write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -1784,6 +2511,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -1797,13 +2526,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -1845,7 +2583,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -1864,7 +2602,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } @@ -1894,7 +2632,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); if (ordered) wait_on_page_writeback(page); else @@ -1902,8 +2641,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; @@ -2052,6 +2790,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -2078,6 +2818,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } @@ -2367,7 +3112,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2403,7 +3148,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) @@ -2431,13 +3176,13 @@ static int build_sit_info(struct 
f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2470,7 +3215,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2521,12 +3266,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2602,10 +3347,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2629,10 +3381,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2676,7 +3433,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { @@ -2694,7 +3451,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2716,7 +3473,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2782,22 +3539,22 @@ int 
build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - - INIT_LIST_HEAD(&sm_info->discard_entry_list); - INIT_LIST_HEAD(&sm_info->discard_cmd_list); - sm_info->nr_discards = 0; - sm_info->max_discards = 0; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); - if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) return err; } + err = create_discard_cmd_control(sbi); + if (err) + return err; + err = build_sit_info(sbi); if (err) return err; @@ -2919,6 +3676,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5cb5755c75d9..ffa11274b0ce 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,88 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) + +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == 
CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + ((segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +113,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. 
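/*
 * Illustrative sketch, not from the patch itself: the segment.h hunks above
 * wrap every macro parameter in parentheses. Without them, an argument that
 * is itself an expression binds to the operators inside the macro body and
 * silently computes the wrong value. A minimal, self-contained demonstration
 * of the failure mode; the macro names below are hypothetical:
 */
#include <assert.h>

#define SEC_FROM_SEG_UNSAFE(segno, segs_per_sec)	(segno / segs_per_sec)
#define SEC_FROM_SEG_SAFE(segno, segs_per_sec)	((segno) / (segs_per_sec))

int main(void)
{
	int base = 8, off = 4;

	/* expands to (8 + 4 / 2) == 10, not the intended section number */
	assert(SEC_FROM_SEG_UNSAFE(base + off, 2) == 10);
	/* expands to ((8 + 4) / (2)) == 6 */
	assert(SEC_FROM_SEG_SAFE(base + off, 2) == 6);
	return 0;
}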
@@ -132,7 +142,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -227,6 +240,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { @@ -303,17 +318,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -358,8 +373,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -379,7 +394,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -390,8 +406,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -412,7 +428,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -475,27 +492,9 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; -} - -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * 
dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -540,6 +539,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, @@ -547,20 +547,22 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -572,6 +574,15 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) @@ -716,6 +727,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) @@ -727,8 +747,8 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, - * 512 pages (2MB) * 3 for three types of nodes, and - * max_bio_blocks for meta are set. + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. 
*/ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { @@ -764,3 +784,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 84d5686c4aa4..315e59ad1483 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include "f2fs.h" #include "node.h" @@ -35,9 +37,7 @@ #define CREATE_TRACE_POINTS #include -static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; -static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; @@ -82,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -89,6 +91,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_noinline_xattr, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -105,6 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, + Opt_usrquota, + Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -115,6 +132,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -122,6 +140,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -138,207 +157,23 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; -/* Sysfs support for f2fs */ -enum { - GC_THREAD, /* struct f2fs_gc_thread */ - SM_INFO, /* struct f2fs_sm_info */ - NM_INFO, /* struct f2fs_nm_info */ - F2FS_SBI, /* struct 
f2fs_sb_info */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - FAULT_INFO_RATE, /* struct f2fs_fault_info */ - FAULT_INFO_TYPE, /* struct f2fs_fault_info */ -#endif -}; - -struct f2fs_attr { - struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); - int struct_type; - int offset; -}; - -static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) -{ - if (struct_type == GC_THREAD) - return (unsigned char *)sbi->gc_thread; - else if (struct_type == SM_INFO) - return (unsigned char *)SM_I(sbi); - else if (struct_type == NM_INFO) - return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) - return (unsigned char *)sbi; -#ifdef CONFIG_F2FS_FAULT_INJECTION - else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; -#endif - return NULL; -} - -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); -} - -static ssize_t f2fs_sbi_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - unsigned char *ptr = NULL; - unsigned int *ui; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned char *ptr; - unsigned long t; - unsigned int *ui; - ssize_t ret; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret < 0) - return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) - return -EINVAL; -#endif - *ui = t; - return count; -} - -static ssize_t f2fs_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->store ? 
a->store(a, sbi, buf, len) : 0; -} - -static void f2fs_sb_release(struct kobject *kobj) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .struct_type = _struct_type, \ - .offset = _offset \ -} - -#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ - F2FS_ATTR_OFFSET(struct_type, name, 0644, \ - f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) - -#define F2FS_GENERAL_RO_ATTR(name) \ -static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) - -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -#ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); -#endif -F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); - -#define ATTR_LIST(name) (&f2fs_attr_##name.attr) -static struct attribute *f2fs_attrs[] = { - ATTR_LIST(gc_min_sleep_time), - ATTR_LIST(gc_max_sleep_time), - ATTR_LIST(gc_no_gc_sleep_time), - ATTR_LIST(gc_idle), - ATTR_LIST(reclaim_segments), - ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), - ATTR_LIST(ipu_policy), - ATTR_LIST(min_ipu_util), - ATTR_LIST(min_fsync_blocks), - ATTR_LIST(max_victim_search), - ATTR_LIST(dir_level), - ATTR_LIST(ram_thresh), - ATTR_LIST(ra_nid_pages), - ATTR_LIST(dirty_nats_ratio), - ATTR_LIST(cp_interval), - ATTR_LIST(idle_interval), -#ifdef CONFIG_F2FS_FAULT_INJECTION - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), -#endif - ATTR_LIST(lifetime_write_kbytes), - NULL, -}; - -static const struct sysfs_ops f2fs_attr_ops = { - .show = f2fs_attr_show, - .store = f2fs_attr_store, -}; - -static struct kobj_type f2fs_ktype = { - .default_attrs = f2fs_attrs, - .sysfs_ops = &f2fs_attr_ops, - .release = f2fs_sb_release, -}; - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -347,7 +182,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -358,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. 
" + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -365,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -431,6 +367,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -441,6 +380,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -454,6 +396,10 @@ static int parse_options(struct super_block *sb, char *options) f2fs_msg(sb, KERN_INFO, "inline_xattr options not supported"); break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: @@ -553,6 +499,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg); + set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -564,6 +511,81 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nolazytime: sb->s_flags &= ~MS_LAZYTIME; break; +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: 
+ case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -571,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -603,14 +629,22 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); +#ifdef CONFIG_QUOTA + memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); + fi->i_reserved_quota = 0; +#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } static int f2fs_drop_inode(struct inode *inode) { + int ret; /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) @@ -643,10 +677,12 @@ static int f2fs_drop_inode(struct inode *inode) spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } + trace_f2fs_drop_inode(inode, 0); return 0; } - - return generic_drop_inode(inode); + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; } int f2fs_inode_dirtied(struct inode *inode, bool sync) @@ -744,15 +780,9 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - kobject_del(&sbi->s_kobj); - - stop_gc_thread(sbi); + f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -771,7 +801,14 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); + + if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -786,7 +823,7 @@ static void f2fs_put_super(struct super_block *sb) mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -796,8 +833,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -805,8 +842,15 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); - + if (sbi->write_io_dummy) + mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); kfree(sbi); } @@ -817,6 +861,9 @@ int 
f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -851,12 +898,55 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; + u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -867,19 +957,67 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); + buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + sbi->reserved_blocks; - buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = min(buf->f_files - valid_node_count(sbi), - buf->f_bavail); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", 
sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -897,7 +1035,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -905,6 +1045,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -943,87 +1085,37 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",active_logs=%u", sbi->active_logs); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); +#endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } -static int segment_info_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i; - - seq_puts(seq, "format: segment_type|valid_blocks\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - if ((i % 10) == 0) - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); - if ((i % 10) == 9 || i == (total_segs - 1)) - seq_putc(seq, '\n'); - else - seq_putc(seq, ' '); - } - - return 0; -} - -static int segment_bits_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i, j; - - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); - for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, " %.2x", se->cur_valid_map[j]); - seq_putc(seq, '\n'); - } - return 0; -} - -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .owner = THIS_MODULE, \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); - static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, 
INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { @@ -1049,6 +1141,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; @@ -1056,14 +1149,37 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we * need to restore them. */ org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -1073,7 +1189,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sbi->mount_opt.opt = 0; default_options(sbi); /* parse mount options */ @@ -1088,6 +1203,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else { + /* dquot_resume needs RW */ + sb->s_flags &= ~MS_RDONLY; + dquot_resume(sb, -1); + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1136,6 +1261,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
MS_POSIXACL : 0); @@ -1150,21 +1280,289 @@ restore_gc: stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; #ifdef CONFIG_F2FS_FAULT_INJECTION sbi->fault_info = ffi; #endif return err; } -static struct super_operations f2fs_sops = { +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_mapping_page(mapping, blkidx, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); + + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, NULL); + if (unlikely(err)) + break; + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, NULL); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return 0; + inode->i_version++; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int 
ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, type); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + f2fs_quota_sync(sb, type); + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + f2fs_quota_off(sb, type); +} + +#if 0 +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} +#endif + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +#if 0 + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +#endif +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#else +void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, @@ -1182,12 +1580,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } -static int f2fs_key_prefix(struct inode *inode, u8 **key) -{ - *key = F2FS_I_SB(inode)->key_prefix; - return F2FS_I_SB(inode)->key_prefix_size; -} - static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void 
*fs_data) { @@ -1202,16 +1594,16 @@ static unsigned f2fs_max_namelen(struct inode *inode) inode->i_sb->s_blocksize : F2FS_NAME_LEN; } -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", .get_context = f2fs_get_context, - .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .is_encrypted = f2fs_encrypted_inode, }; #endif @@ -1263,9 +1655,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + /* two direct node blocks */ result += (leaf_count * 2); @@ -1467,6 +1866,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -1480,6 +1886,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1501,6 +1909,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1511,7 +1933,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; + int i, j; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1539,17 +1961,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - mutex_init(&sbi->wio_mutex[NODE]); - mutex_init(&sbi->wio_mutex[DATA]); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; -#endif } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -1579,16 +1998,16 @@ 
static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_size(bdev) - 1)) + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); @@ -1724,36 +2143,59 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; int i; - for (i = 0; i < MAX_DEVICES; i++) { - if (!RDEV(i).path[0]) + /* Initialize single device information */ + if (!RDEV(0).path[0]) { +#ifdef CONFIG_BLK_DEV_ZONED + if (!bdev_is_zoned(sbi->sb->s_bdev)) return 0; + max_devices = 1; +#else + return 0; +#endif + } - if (i == 0) { - sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * - MAX_DEVICES, GFP_KERNEL); - if (!sbi->devs) - return -ENOMEM; - } + /* + * Initialize multiple devices information, or single + * zoned block device information. + */ + sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; - memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); - FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); - if (i == 0) { - FDEV(i).start_blk = 0; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1 + - le32_to_cpu(raw_super->segment0_blkaddr); - } else { - FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1; - } + for (i = 0; i < max_devices; i++) { - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -1773,6 +2215,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) "Failed to initialize F2FS blkzone information"); return -EINVAL; } + if (max_devices == 1) + break; f2fs_msg(sbi->sb, KERN_INFO, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", i, FDEV(i).path, @@ -1841,6 +2285,11 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, 
~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware @@ -1850,6 +2299,7 @@ try_onemore: if (f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; goto free_sb_buf; } #endif @@ -1871,6 +2321,12 @@ try_onemore: sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; +#endif + sb->s_op = &f2fs_sops; sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; @@ -1886,18 +2342,34 @@ try_onemore: mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->read_io.io_rwsem); - sbi->read_io.sbi = sbi; - sbi->read_io.bio = NULL; + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { - init_rwsem(&sbi->write_io[i].io_rwsem); - sbi->write_io[i].sbi = sbi; - sbi->write_io[i].bio = NULL; + int n = (i == META) ? 1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); + if (!sbi->write_io[i]) { + err = -ENOMEM; + goto free_options; + } + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + } } init_rwsem(&sbi->cp_rwsem); @@ -1910,9 +2382,11 @@ try_onemore: if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = - mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); - if (!sbi->write_io_dummy) + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); + if (!sbi->write_io_dummy) { + err = -ENOMEM; goto free_options; + } } /* get an inode for meta space */ @@ -1944,6 +2418,7 @@ try_onemore: sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -1991,10 +2466,9 @@ try_onemore: f2fs_join_shrinker(sbi); - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); + err = f2fs_build_stats(sbi); if (err) - goto free_node_inode; + goto free_nm; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -2015,26 +2489,14 @@ try_onemore: goto free_root_inode; } - err = f2fs_build_stats(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); if (err) - goto free_proc; + goto free_sysfs; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2045,7 
+2507,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_kobj; + goto free_meta; } if (need_fsck) @@ -2059,7 +2521,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_kobj; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2068,7 +2530,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_kobj; + goto free_sysfs; } } skip_recovery: @@ -2083,7 +2545,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_kobj; + goto free_meta; } kfree(options); @@ -2095,22 +2557,23 @@ skip_recovery: sbi->valid_super_block ? 1 : 2, err); } + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); return 0; -free_kobj: +free_meta: f2fs_sync_inode_meta(sbi); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); -free_proc: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - f2fs_destroy_stats(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2119,15 +2582,9 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). 
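The relabelled unwind path above (free_kobj and free_proc giving way to free_meta and free_sysfs, with the META truncation moved before iput()) is the kernel's usual cascading-cleanup idiom: each failing step jumps to the label that undoes everything initialized so far. A minimal sketch of the shape, with hypothetical step_a/step_b/undo_a standing in for the real calls:

	int step_a(void);	/* cf. f2fs_register_sysfs() */
	int step_b(void);	/* cf. recover_orphan_inodes() */
	void undo_a(void);	/* cf. f2fs_unregister_sysfs() */

	int setup(void)
	{
		int err;

		err = step_a();
		if (err)
			return err;	/* nothing to unwind yet */

		err = step_b();
		if (err)
			goto free_a;	/* undo step_a before failing */

		return 0;

	free_a:
		undo_a();
		return err;
	}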
- */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: @@ -2141,7 +2598,13 @@ free_meta_inode: free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_options: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); @@ -2167,8 +2630,11 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) + if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + stop_gc_thread(F2FS_SB(sb)); + stop_discard_thread(F2FS_SB(sb)); + } kill_block_super(sb); } @@ -2222,30 +2688,26 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) { - err = -ENOMEM; + err = f2fs_init_sysfs(); + if (err) goto free_extent_cache; - } err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_kset; - + goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; err = f2fs_create_root_stats(); if (err) goto free_filesystem; - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_kset: - kset_unregister(f2fs_kset); +free_sysfs: + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2262,11 +2724,10 @@ fail: static void __exit exit_f2fs_fs(void) { - remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - kset_unregister(f2fs_kset); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..e2c258f717cd --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,556 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
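For orientation before the body of the new file: each tunable it exports is declared through the F2FS_RW_ATTR() macro defined further down. Taking one concrete case, F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level) expands, modulo whitespace, to:

	static struct f2fs_attr f2fs_attr_dir_level = {
		.attr = { .name = "dir_level", .mode = 0644 },
		.show = f2fs_sbi_show,
		.store = f2fs_sbi_store,
		.struct_type = F2FS_SBI,
		.offset = offsetof(struct f2fs_sb_info, dir_level),
	};

A read of the resulting /sys/fs/f2fs/<disk>/dir_level file then lands in f2fs_sbi_show(), which applies .offset to the base pointer that __struct_ptr() picks for .struct_type.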
+ */ +#include <linux/proc_fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; + int id; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif + return NULL; +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ?
", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if ((unsigned long)sbi->total_valid_block_count + t > + (unsigned long)sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + spin_unlock(&sbi->stat_lock); + return count; + } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + + *ui = t; + return count; + } + + *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef 
CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), + ATTR_LIST(reserved_blocks), + NULL, +}; + +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_sb_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +static int 
iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); + +int __init f2fs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; +} + +void f2fs_exit_sysfs(void) +{ + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; +} + +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); + } + return 0; +} + +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) +{ + if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); +} diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; 
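On the trace.c hunk here: the replacement line that follows stores the pid through set_page_private() rather than assigning page->private directly. The field is an unsigned long, and the accessor pair from <linux/mm.h> keeps the cast explicit at the call site; in kernels of this vintage the accessors are defined approximately as:

	#define page_private(page)		((page)->private)
	#define set_page_private(page, v)	((page)->private = (v))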
+ set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1c4d5e39586c..ab658419552b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,18 +264,123 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } +static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, + void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > base_addr + inline_size || + (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > + base_addr + inline_size) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + void *cur_addr, *txattr_addr, *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; + unsigned int inline_size = inline_xattr_size(inode); + int err = 0; + + if (!size && !inline_size) + return -ENODATA; + + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + *xe = __find_inline_xattr(txattr_addr, &last_addr, + index, len, name); + if (*xe) + goto check; + } + + /* read from xattr node block */ + if (xnid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. 
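Summing up the lookup path added above: lookup_all_xattrs() concatenates at most two regions into one temporary buffer and then searches that. Schematically, with sizes as defined elsewhere in this patch (the diagram itself is illustrative):

	/*
	 * txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE)
	 *
	 * +--------------------+--------------------------------+---------+
	 * | inline xattrs from | xattr node block contents      | zeroed  |
	 * | the inode body     | (size = VALID_XATTR_BLOCK_SIZE | padding |
	 * |                    |  when i_xattr_nid is set)      |         |
	 * +--------------------+--------------------------------+---------+
	 * 0                    inline_size                      inline_size + size
	 */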
*/ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + goto out; + } + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, size); + f2fs_put_page(xpage, 1); + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, index, len, name); +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -299,19 +404,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -333,14 +438,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; @@ -386,7 +489,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); @@ -395,23 +498,20 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - - sizeof(struct node_footer)); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); - /* need to checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); return 0; } int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { - struct f2fs_xattr_entry *entry; - void *base_addr; + struct f2fs_xattr_entry *entry = NULL; int error = 0; - size_t size, len; + unsigned int size, len; + void *base_addr = NULL; if (name == NULL) return -EINVAL; @@ -420,21 +520,18 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - error = read_all_xattrs(inode, ipage, &base_addr); + 
down_read(&F2FS_I(inode)->i_xattr_sem); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; - entry = __find_xattr(base_addr, index, len, name); - if (IS_XATTR_LAST_ENTRY(entry)) { - error = -ENODATA; - goto cleanup; - } - size = le16_to_cpu(entry->e_value_size); if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } if (buffer) { @@ -442,8 +539,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, memcpy(buffer, pval, size); } error = size; - -cleanup: +out: kzfree(base_addr); return error; } @@ -456,7 +552,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -485,6 +583,15 @@ cleanup: return error; } +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); +} + static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) @@ -519,12 +626,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - if ((flags & XATTR_REPLACE) && !found) { + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; - } else if ((flags & XATTR_CREATE) && found) { - error = -EEXIST; - goto exit; } last = here; @@ -618,7 +730,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d2fd0387a3c7..08a4840d6d7d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -72,9 +72,10 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - -#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ - sizeof(struct node_footer) - sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) #define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ sizeof(struct f2fs_xattr_header) - \ diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 
cf54a312993f..c2a975e4a711 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -114,6 +114,8 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_TRIMMED_FLAG 0x00000100 +#define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 @@ -184,6 +186,8 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ + get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ @@ -203,9 +207,7 @@ struct f2fs_extent { #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ - -#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ - F2FS_INLINE_XATTR_ADDRS - 1)) +#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -233,8 +235,16 @@ struct f2fs_inode { struct f2fs_extent i_ext; /* caching a largest extent */ - __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - + union { + struct { + __le16 i_extra_isize; /* extra inode attribute size */ + __le16 i_padding; /* padding */ + __le32 i_projid; /* project id */ + __le32 i_inode_checksum;/* inode meta checksum */ + __le32 i_extra_end[0]; /* for attribute size calculation */ + }; + __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ + }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; @@ -278,6 +288,7 @@ struct f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -462,7 +473,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) /* - * space utilization of regular dentry and inline dentry + * space utilization of regular dentry and inline dentry (w/o extra reservation) * regular dentry inline dentry * bitmap 1 * 27 = 27 1 * 23 = 23 * reserved 1 * 3 = 3 1 * 7 = 7 @@ -498,24 +509,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* for inline dir */ -#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - BITS_PER_BYTE + 1)) -#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \ - BITS_PER_BYTE - 1) / BITS_PER_BYTE) -#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE)) - -/* inline directory entry structure */ -struct f2fs_inline_dentry { - __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE]; - __u8 reserved[INLINE_RESERVED_SIZE]; - struct f2fs_dir_entry dentry[NR_INLINE_DENTRY]; - __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN]; -} __packed; - /* file types used in inode_info->flags */ enum { F2FS_FT_UNKNOWN, @@ -531,4 +524,6 @@ enum { #define S_SHIFT 12 +#define F2FS_DEF_PROJID 0 /* default project ID */ + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h new 
file mode 100644
index 000000000000..4022c61f7e9b
--- /dev/null
+++ b/include/linux/fscrypt_common.h
@@ -0,0 +1,138 @@
+/*
+ * fscrypt_common.h: common declarations for per-file encryption
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#ifndef _LINUX_FSCRYPT_COMMON_H
+#define _LINUX_FSCRYPT_COMMON_H
+
+#include <linux/key.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <crypto/skcipher.h>
+#include <uapi/linux/fs.h>
+
+#define FS_CRYPTO_BLOCK_SIZE 16
+
+struct fscrypt_info;
+
+struct fscrypt_ctx {
+ union {
+ struct {
+ struct page *bounce_page; /* Ciphertext page */
+ struct page *control_page; /* Original page */
+ } w;
+ struct {
+ struct bio *bio;
+ struct work_struct work;
+ } r;
+ struct list_head free_list; /* Free list */
+ };
+ u8 flags; /* Flags */
+};
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct fscrypt_symlink_data {
+ __le16 len;
+ char encrypted_path[1];
+} __packed;
+
+struct fscrypt_str {
+ unsigned char *name;
+ u32 len;
+};
+
+struct fscrypt_name {
+ const struct qstr *usr_fname;
+ struct fscrypt_str disk_name;
+ u32 hash;
+ u32 minor_hash;
+ struct fscrypt_str crypto_buf;
+};
+
+#define FSTR_INIT(n, l) { .name = n, .len = l }
+#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p) ((p)->disk_name.name)
+#define fname_len(p) ((p)->disk_name.len)
+
+/*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_OWN_PAGES (1U << 1)
+
+/*
+ * crypto operations for filesystems
+ */
+struct fscrypt_operations {
+ unsigned int flags;
+ const char *key_prefix;
+ int (*get_context)(struct inode *, void *, size_t);
+ int (*set_context)(struct inode *, const void *, size_t, void *);
+ int (*dummy_context)(struct inode *);
+ bool (*is_encrypted)(struct inode *);
+ bool (*empty_dir)(struct inode *);
+ unsigned (*max_namelen)(struct inode *);
+};
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+ if (inode->i_sb->s_cop->dummy_context &&
+ inode->i_sb->s_cop->dummy_context(inode))
+ return true;
+ return false;
+}
+
+static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
+ u32 filenames_mode)
+{
+ if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
+ filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
+ return true;
+
+ if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
+ filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
+ return true;
+
+ return false;
+}
+
+static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
+{
+ if (str->len == 1 && str->name[0] == '.')
+ return true;
+
+ if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+ return true;
+
+ return false;
+}
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+#else
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EINVAL);
+#endif
+}
+
+static inline int fscrypt_has_encryption_key(const struct inode *inode)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ return (inode->i_crypt_info != NULL);
+#else
+ return 0;
+#endif
+}
+
+#endif /* _LINUX_FSCRYPT_COMMON_H */
diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h
new file mode 100644
index 000000000000..ec406aed2f2f
--- /dev/null
+++ b/include/linux/fscrypt_notsupp.h
@@ -0,0 +1,177 @@
+/*
+ * fscrypt_notsupp.h
+ *
+ * This stubs out the fscrypt functions for filesystems configured without
+ * encryption support.
+ */
+
+#ifndef _LINUX_FSCRYPT_NOTSUPP_H
+#define _LINUX_FSCRYPT_NOTSUPP_H
+
+#include <linux/fscrypt_common.h>
+
+/* crypto.c */
+static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+ return;
+}
+
+static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len,
+ unsigned int offs,
+ u64 lblk_num, gfp_t gfp_flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline int fscrypt_decrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len, unsigned int offs,
+ u64 lblk_num)
+{
+ return -EOPNOTSUPP;
+}
+
+
+static inline void fscrypt_restore_control_page(struct page *page)
+{
+ return;
+}
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+ return;
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+ return;
+}
+
+/* policy.c */
+static inline int fscrypt_ioctl_set_policy(struct file *filp,
+ const void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_has_permitted_context(struct inode *parent,
+ struct inode *child)
+{
+ return 0;
+}
+
+static inline int fscrypt_inherit_context(struct inode *parent,
+ struct inode *child,
+ void *fs_data, bool preload)
+{
+ return -EOPNOTSUPP;
+}
+
+/* keyinfo.c */
+static inline int fscrypt_get_encryption_info(struct inode *inode)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_put_encryption_info(struct inode *inode,
+ struct fscrypt_info *ci)
+{
+ return;
+}
+
+ /* fname.c */
+static inline int fscrypt_setup_filename(struct inode *dir,
+ const struct qstr *iname,
+ int lookup, struct fscrypt_name *fname)
+{
+ if (dir->i_sb->s_cop->is_encrypted(dir))
+ return -EOPNOTSUPP;
+
+ memset(fname, 0, sizeof(struct fscrypt_name));
+ fname->usr_fname = iname;
+ fname->disk_name.name = (unsigned char *)iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+}
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+ return;
+}
+
+static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode,
+ u32 ilen)
+{
+ /* never happens */
+ WARN_ON(1);
+ return 0;
+}
+
+static inline int fscrypt_fname_alloc_buffer(const struct inode *inode,
+ u32 ilen,
+ struct fscrypt_str *crypto_str)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
+{
+ return;
+}
+
+static inline int fscrypt_fname_disk_to_usr(struct inode *inode,
+ u32 hash, u32 minor_hash,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct fscrypt_str *oname)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
+ const u8 *de_name, u32 de_name_len)
+{
+ /* Encryption support disabled; use standard comparison */
+ if (de_name_len != fname->disk_name.len)
+ return false;
+ return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
+}
+
+/* bio.c */
+static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx,
+ struct bio *bio)
+{
+ return;
+}
+
+static inline void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+ return;
+}
+
+static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+ sector_t pblk, unsigned int len)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif /* _LINUX_FSCRYPT_NOTSUPP_H */
diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h
new file mode 100644
index 000000000000..32e2fcf13b01
--- /dev/null
+++ b/include/linux/fscrypt_supp.h
@@ -0,0 +1,145 @@
+/*
+ * fscrypt_supp.h
+ *
+ * This is included by filesystems configured with encryption support.
+ */
+
+#ifndef _LINUX_FSCRYPT_SUPP_H
+#define _LINUX_FSCRYPT_SUPP_H
+
+#include <linux/fscrypt_common.h>
+
+/* crypto.c */
+extern struct kmem_cache *fscrypt_info_cachep;
+extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
+extern void fscrypt_release_ctx(struct fscrypt_ctx *);
+extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
+ unsigned int, unsigned int,
+ u64, gfp_t);
+extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
+ unsigned int, u64);
+extern void fscrypt_restore_control_page(struct page *);
+
+extern const struct dentry_operations fscrypt_d_ops;
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+ d_set_d_op(dentry, &fscrypt_d_ops);
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+ spin_unlock(&dentry->d_lock);
+}
+
+/* policy.c */
+extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
+extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
+extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
+extern int fscrypt_inherit_context(struct inode *, struct inode *,
+ void *, bool);
+/* keyinfo.c */
+extern int fscrypt_get_encryption_info(struct inode *);
+extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
+
+/* fname.c */
+extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
+ int lookup, struct fscrypt_name *);
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+ kfree(fname->crypto_buf.name);
+}
+
+extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32);
+extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
+ struct fscrypt_str *);
+extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
+extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
+ const struct fscrypt_str *, struct fscrypt_str *);
+extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
+ struct fscrypt_str *);
+
+#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32
+
+/* Extracts the second-to-last ciphertext block; see explanation below */
+#define FSCRYPT_FNAME_DIGEST(name, len) \
+ ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \
+ FS_CRYPTO_BLOCK_SIZE))
+
+#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE
+
+/**
+ * fscrypt_digested_name - alternate identifier for an on-disk filename
+ *
+ * When userspace lists an encrypted directory without access to the key,
+ * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE
+ * bytes are shown in this abbreviated form (base64-encoded) rather than as the
+ * full ciphertext (base64-encoded). This is necessary to allow supporting
+ * filenames up to NAME_MAX bytes, since base64 encoding expands the length.
+ *
+ * To make it possible for filesystems to still find the correct directory entry
+ * despite not knowing the full on-disk name, we encode any filesystem-specific
+ * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups,
+ * followed by the second-to-last ciphertext block of the filename.
Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. + * + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. + */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. + */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* bio.c */ +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, + unsigned int); + +#endif /* _LINUX_FSCRYPT_SUPP_H */ diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index ff8b11b26f31..e6e53a36104b 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -250,8 +250,8 @@ extern void fscrypt_restore_control_page(struct page *); extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, unsigned int); /* policy.c */ -extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); -extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); extern int fscrypt_has_permitted_context(struct inode *, struct inode *); extern int fscrypt_inherit_context(struct inode *, struct inode *, void *, bool); @@ -320,14 +320,14 @@ static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, } /* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct file *f, - const struct fscrypt_policy *p) +static inline int 
fscrypt_notsupp_ioctl_set_policy(struct file *f,
+ const void __user *arg)
{
 return -EOPNOTSUPP;
}

-static inline int fscrypt_notsupp_get_policy(struct inode *i,
- struct fscrypt_policy *p)
+static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f,
+ void __user *arg)
{
 return -EOPNOTSUPP;
}
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 217691582dd4..7063bbcca03b 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -6,8 +6,8 @@
#include <linux/tracepoint.h>

-#define show_dev(entry) MAJOR(entry->dev), MINOR(entry->dev)
-#define show_dev_ino(entry) show_dev(entry), (unsigned long)entry->ino
+#define show_dev(dev) MAJOR(dev), MINOR(dev)
+#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino

TRACE_DEFINE_ENUM(NODE);
TRACE_DEFINE_ENUM(DATA);
@@ -15,6 +15,7 @@ TRACE_DEFINE_ENUM(META);
TRACE_DEFINE_ENUM(META_FLUSH);
TRACE_DEFINE_ENUM(INMEM);
TRACE_DEFINE_ENUM(INMEM_DROP);
+TRACE_DEFINE_ENUM(INMEM_INVALIDATE);
TRACE_DEFINE_ENUM(IPU);
TRACE_DEFINE_ENUM(OPU);
TRACE_DEFINE_ENUM(CURSEG_HOT_DATA);
@@ -43,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT);
TRACE_DEFINE_ENUM(CP_SYNC);
TRACE_DEFINE_ENUM(CP_RECOVERY);
TRACE_DEFINE_ENUM(CP_DISCARD);
+TRACE_DEFINE_ENUM(CP_TRIMMED);

#define show_block_type(type) \
 __print_symbolic(type, \
@@ -52,6 +54,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 { META_FLUSH, "META_FLUSH" }, \
 { INMEM, "INMEM" }, \
 { INMEM_DROP, "INMEM_DROP" }, \
+ { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \
 { INMEM_REVOKE, "INMEM_REVOKE" }, \
 { IPU, "IN-PLACE" }, \
 { OPU, "OUT-OF-PLACE" })
@@ -80,6 +83,12 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 { REQ_META | REQ_PRIO, "(MP)" }, \
 { 0, " \b" })

+#define show_block_temp(temp) \
+ __print_symbolic(temp, \
+ { HOT, "HOT" }, \
+ { WARM, "WARM" }, \
+ { COLD, "COLD" })
+
#define show_data_type(type) \
 __print_symbolic(type, \
 { CURSEG_HOT_DATA, "Hot DATA" }, \
@@ -116,7 +125,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 { CP_FASTBOOT, "Fastboot" }, \
 { CP_SYNC, "Sync" }, \
 { CP_RECOVERY, "Recovery" }, \
- { CP_DISCARD, "Discard" })
+ { CP_DISCARD, "Discard" }, \
+ { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" })

struct victim_sel_policy;
struct f2fs_map_blocks;
@@ -239,7 +249,7 @@ TRACE_EVENT(f2fs_sync_fs,
 ),

 TP_printk("dev = (%d,%d), superblock is %s, wait = %d",
- show_dev(__entry),
+ show_dev(__entry->dev),
 __entry->dirty ?
"dirty" : "not dirty", __entry->wait) ); @@ -309,6 +319,13 @@ DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, TP_ARGS(inode, ret) ); +DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode, + + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret) +); + DEFINE_EVENT(f2fs__inode, f2fs_truncate, TP_PROTO(struct inode *inode), @@ -518,14 +535,14 @@ TRACE_EVENT(f2fs_map_blocks, TRACE_EVENT(f2fs_background_gc, - TP_PROTO(struct super_block *sb, long wait_ms, + TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) - __field(long, wait_ms) + __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), @@ -537,13 +554,120 @@ TRACE_EVENT(f2fs_background_gc, __entry->free = free; ), - TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", + show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) ); +TRACE_EVENT(f2fs_gc_begin, + + TP_PROTO(struct super_block *sb, bool sync, bool background, + long long dirty_nodes, long long dirty_dents, + long long dirty_imeta, unsigned int free_sec, + unsigned int free_seg, int reserved_seg, + unsigned int prefree_seg), + + TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, sync) + __field(bool, background) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->sync = sync; + __entry->background = background; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + "rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->sync, + __entry->background, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + +TRACE_EVENT(f2fs_gc_end, + + TP_PROTO(struct super_block *sb, int ret, int seg_freed, + int sec_freed, long long dirty_nodes, + long long dirty_dents, long long dirty_imeta, + unsigned int free_sec, unsigned int free_seg, + int reserved_seg, unsigned int prefree_seg), + + TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, + dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ret) + __field(int, seg_freed) + __field(int, sec_freed) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->ret = ret; + __entry->seg_freed = seg_freed; + __entry->sec_freed = sec_freed; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + 
__entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " + "free_seg:%u, rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->ret, + __entry->seg_freed, + __entry->sec_freed, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, @@ -580,7 +704,7 @@ TRACE_EVENT(f2fs_get_victim, TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u " "ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u", - show_dev(__entry), + show_dev(__entry->dev), show_data_type(__entry->type), show_gc_type(__entry->gc_type), show_alloc_mode(__entry->alloc_mode), @@ -717,7 +841,7 @@ TRACE_EVENT(f2fs_reserve_new_blocks, ), TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", - show_dev(__entry), + show_dev(__entry->dev), (unsigned int)__entry->nid, __entry->ofs_in_node, (unsigned long long)__entry->count) @@ -737,6 +861,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(block_t, new_blkaddr) __field(int, op) __field(int, op_flags) + __field(int, temp) __field(int, type) ), @@ -748,16 +873,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; __entry->op_flags = fio->op_flags; + __entry->temp = fio->temp; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->op, __entry->op_flags), + show_block_temp(__entry->temp), show_block_type(__entry->type)) ); @@ -770,7 +897,7 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, TP_CONDITION(page->mapping) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, +DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, TP_PROTO(struct page *page, struct f2fs_io_info *fio), @@ -787,6 +914,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_STRUCT__entry( __field(dev_t, dev) + __field(dev_t, target) __field(int, op) __field(int, op_flags) __field(int, type) @@ -796,6 +924,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_fast_assign( __entry->dev = sb->s_dev; + __entry->target = bio->bi_bdev->bd_dev; __entry->op = bio_op(bio); __entry->op_flags = bio->bi_rw; __entry->type = type; @@ -803,8 +932,9 @@ DECLARE_EVENT_CLASS(f2fs__bio, __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u", + show_dev(__entry->target), + show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, @@ -1101,16 +1231,16 @@ TRACE_EVENT(f2fs_write_checkpoint, ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", - show_dev(__entry), + show_dev(__entry->dev), show_cpreason(__entry->reason), __entry->msg) ); -TRACE_EVENT(f2fs_issue_discard, +DECLARE_EVENT_CLASS(f2fs_discard, 
- TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen), + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), - TP_ARGS(sb, blkstart, blklen), + TP_ARGS(dev, blkstart, blklen), TP_STRUCT__entry( __field(dev_t, dev) @@ -1119,22 +1249,36 @@ TRACE_EVENT(f2fs_issue_discard, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; __entry->blklen = blklen; ), TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, - TP_PROTO(struct super_block *sb, block_t blkstart), + TP_PROTO(struct block_device *dev, block_t blkstart), - TP_ARGS(sb, blkstart), + TP_ARGS(dev, blkstart), TP_STRUCT__entry( __field(dev_t, dev) @@ -1142,38 +1286,41 @@ TRACE_EVENT(f2fs_issue_reset_zone, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; ), TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart) ); TRACE_EVENT(f2fs_issue_flush, - TP_PROTO(struct super_block *sb, unsigned int nobarrier, - unsigned int flush_merge), + TP_PROTO(struct block_device *dev, unsigned int nobarrier, + unsigned int flush_merge, int ret), - TP_ARGS(sb, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) + __field(int, ret) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; + __entry->ret = ret; ), - TP_printk("dev = (%d,%d), %s %s", - show_dev(__entry), + TP_printk("dev = (%d,%d), %s %s, ret = %d", + show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", - __entry->flush_merge ? " with flush_merge" : "") + __entry->flush_merge ? 
" with flush_merge" : "", + __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, @@ -1286,7 +1433,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, ), TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", - show_dev(__entry), + show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt) ); @@ -1333,7 +1480,7 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, ), TP_printk("dev = (%d,%d), %s, dirty count = %lld", - show_dev(__entry), + show_dev(__entry->dev), show_file_type(__entry->type), __entry->count) ); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 15048097910f..60d27496c328 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -178,6 +178,23 @@ struct inodes_stat_t { /* Policy provided via an ioctl on the topmost directory */ #define FS_KEY_DESCRIPTOR_SIZE 8 +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 +#define FS_ENCRYPTION_MODE_AES_128_CBC 5 +#define FS_ENCRYPTION_MODE_AES_128_CTS 6 + + struct fscrypt_policy { __u8 version; __u8 contents_encryption_mode; @@ -190,6 +207,19 @@ struct fscrypt_policy { #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ diff --git a/mm/util.c b/mm/util.c index d5259b62f8d7..d7b1065644be 100644 --- a/mm/util.c +++ b/mm/util.c @@ -348,6 +348,7 @@ struct address_space *page_mapping(struct page *page) return NULL; return page->mapping; } +EXPORT_SYMBOL(page_mapping); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, From e5e42eca05ada917895af063bc46badf1515043c Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 8 May 2017 09:33:22 -0700 Subject: [PATCH 1154/3220] ANDROID: binder: Add tracing for binder priority inheritance. 
Bug: 34461621 Change-Id: I5ebb1c0c49fd42a89ee250a1d70221f767c82c7c Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 4 ++++ drivers/android/binder_trace.h | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 32a2b2f44691..64f393239f6f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1154,6 +1154,10 @@ static void binder_do_set_priority(struct task_struct *task, task->pid, desired.prio, to_kernel_prio(policy, priority)); + trace_binder_set_priority(task->tgid, task->pid, task->normal_prio, + to_kernel_prio(policy, priority), + desired.prio); + /* Set the actual priority */ if (task->policy != policy || is_rt_policy(policy)) { struct sched_param params; diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 76e3b9c8a8a2..b11dffc521e8 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -85,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done); +TRACE_EVENT(binder_set_priority, + TP_PROTO(int proc, int thread, unsigned int old_prio, + unsigned int desired_prio, unsigned int new_prio), + TP_ARGS(proc, thread, old_prio, new_prio, desired_prio), + + TP_STRUCT__entry( + __field(int, proc) + __field(int, thread) + __field(unsigned int, old_prio) + __field(unsigned int, new_prio) + __field(unsigned int, desired_prio) + ), + TP_fast_assign( + __entry->proc = proc; + __entry->thread = thread; + __entry->old_prio = old_prio; + __entry->new_prio = new_prio; + __entry->desired_prio = desired_prio; + ), + TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d", + __entry->proc, __entry->thread, __entry->old_prio, + __entry->new_prio, __entry->desired_prio) +); + TRACE_EVENT(binder_wait_for_work, TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo), TP_ARGS(proc_work, transaction_stack, thread_todo), From 3cc621033b682479fb7fdd4028d50e7b7458a08e Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 24 Aug 2017 15:23:36 +0200 Subject: [PATCH 1155/3220] ANDROID: binder: fix transaction leak. If a call to put_user() fails, we failed to properly free a transaction and send a failed reply (if necessary). Bug: 63117588 Test: binderLibTest Change-Id: Ia98db8cd82ce354a4cdc8811c969988d585c7e31 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 64f393239f6f..bc8d9ecfebec 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2106,6 +2106,26 @@ static void binder_send_failed_reply(struct binder_transaction *t, } } +/** + * binder_cleanup_transaction() - cleans up undelivered transaction + * @t: transaction that needs to be cleaned up + * @reason: reason the transaction wasn't delivered + * @error_code: error to return to caller (if synchronous call) + */ +static void binder_cleanup_transaction(struct binder_transaction *t, + const char *reason, + uint32_t error_code) +{ + if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) { + binder_send_failed_reply(t, error_code); + } else { + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered transaction %d, %s\n", + t->debug_id, reason); + binder_free_transaction(t); + } +} + /** * binder_validate_object() - checks for a valid metadata object in a buffer. 
* @buffer: binder_buffer that we're parsing. @@ -4188,12 +4208,20 @@ retry: if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); + + binder_cleanup_transaction(t, "put_user failed", + BR_FAILED_REPLY); + return -EFAULT; } ptr += sizeof(uint32_t); if (copy_to_user(ptr, &tr, sizeof(tr))) { if (t_from) binder_thread_dec_tmpref(t_from); + + binder_cleanup_transaction(t, "copy_to_user failed", + BR_FAILED_REPLY); + return -EFAULT; } ptr += sizeof(tr); @@ -4263,15 +4291,9 @@ static void binder_release_work(struct binder_proc *proc, struct binder_transaction *t; t = container_of(w, struct binder_transaction, work); - if (t->buffer->target_node && - !(t->flags & TF_ONE_WAY)) { - binder_send_failed_reply(t, BR_DEAD_REPLY); - } else { - binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, - "undelivered transaction %d\n", - t->debug_id); - binder_free_transaction(t); - } + + binder_cleanup_transaction(t, "process died.", + BR_DEAD_REPLY); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( From 139ac8ac89e5918fcbafa24e849e8c277fe2f66c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 5 Oct 2017 17:54:31 -0700 Subject: [PATCH 1156/3220] FROMLIST: tracing: Prepare to add preempt and irq trace events In preparation of adding irqsoff and preemptsoff enable and disable trace events, move required functions and code to make it easier to add these events in a later patch. This patch is just code movement and no functional change. Change-Id: I587d411da5efbc4959bcccd7a05c7a66c231e1e0 Cc: Steven Rostedt Cc: Peter Zijlstra Cc: kernel-team@android.com Link: https://patchwork.kernel.org/patch/9988159/ Signed-off-by: Joel Fernandes --- kernel/trace/trace_irqsoff.c | 100 ++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index be3222b7d72e..0e56eace0dde 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,7 @@ #include "trace.h" +#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -450,64 +451,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) #else /* !CONFIG_PROVE_LOCKING */ -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} - /* * We are only interested in hardirq on/off events: */ -void trace_hardirqs_on(void) +static inline void tracer_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_on); -void trace_hardirqs_off(void) +static inline void tracer_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_off); -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_on_caller); -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* 
CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */

#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
 if (preempt_trace() && !irq_trace())
 stop_critical_timing(a0, a1);
}

-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
 if (preempt_trace() && !irq_trace())
 start_critical_timing(a0, a1);
@@ -770,3 +751,70 @@ __init static int init_irqsoff_tracer(void)
 return 0;
}
core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+void trace_hardirqs_on(void)
+{
+ tracer_hardirqs_on();
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ tracer_hardirqs_on_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ tracer_preempt_off(a0, a1);
+}
+#endif

From e5486e9c8991aa5b55959c96b6b375645b12dad6 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Thu, 5 Oct 2017 17:54:32 -0700
Subject: [PATCH 1157/3220] FROMLIST: tracing: Add support for preempt and irq enable/disable events

Preempt and irq trace events can be used to mark the start and end of
an atomic section. A trace viewer like systrace can then display these
sections graphically and correlate them with latencies and scheduling
issues.

This also serves as a prelude to rewriting the preempt and irqsoff
tracers on top of synthetic events or probes, which would bring the
numerous benefits of the trace event infrastructure to these events.
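As a rough usage sketch (not part of this patch: the paths assume tracefs
at the usual debugfs mount point, a kernel built with
CONFIG_PREEMPTIRQ_EVENTS, and PROVE_LOCKING disabled so the irq event
pair is available):

  echo 1 > /sys/kernel/debug/tracing/events/preemptirq/irq_disable/enable
  echo 1 > /sys/kernel/debug/tracing/events/preemptirq/irq_enable/enable
  cat /sys/kernel/debug/tracing/trace_pipe

Each record reports the caller and parent of the disable/enable site,
which is what a viewer needs in order to draw the atomic section.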
Change-Id: I718d40f7c3c48579adf9d7121b21495a669c89bd
Cc: Steven Rostedt
Cc: Peter Zijlstra
Cc: kernel-team@android.com
Link: https://patchwork.kernel.org/patch/9988157/
Signed-off-by: Joel Fernandes
---
 include/linux/ftrace.h | 3 +-
 include/trace/events/preemptirq.h | 70 +++++++++++++++++++++++++++++++
 kernel/trace/Kconfig | 11 +++++
 kernel/trace/Makefile | 1 +
 kernel/trace/trace_irqsoff.c | 35 +++++++++++++++-
 5 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/preemptirq.h

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 60048c50404e..ed94cea9eaff 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -702,7 +702,8 @@ static inline void __ftrace_enabled_restore(int enabled)
static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
#endif

-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
extern void trace_preempt_on(unsigned long a0, unsigned long a1);
extern void trace_preempt_off(unsigned long a0, unsigned long a1);
#else
diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
new file mode 100644
index 000000000000..f5024c560d8f
--- /dev/null
+++ b/include/trace/events/preemptirq.h
@@ -0,0 +1,70 @@
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM preemptirq
+
+#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PREEMPTIRQ_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+#include <linux/string.h>
+#include <asm/sections.h>
+
+DECLARE_EVENT_CLASS(preemptirq_template,
+
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+
+ TP_ARGS(ip, parent_ip),
+
+ TP_STRUCT__entry(
+ __field(u32, caller_offs)
+ __field(u32, parent_offs)
+ ),
+
+ TP_fast_assign(
+ __entry->caller_offs = (u32)(ip - (unsigned long)_stext);
+ __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
+ ),
+
+ TP_printk("caller=%pF parent=%pF",
+ (void *)((unsigned long)(_stext) + __entry->caller_offs),
+ (void *)((unsigned long)(_stext) + __entry->parent_offs))
+);
+
+#ifndef CONFIG_PROVE_LOCKING
+DEFINE_EVENT(preemptirq_template, irq_disable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, irq_enable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+#endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+DEFINE_EVENT(preemptirq_template, preempt_disable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, preempt_enable,
+ TP_PROTO(unsigned long ip, unsigned long parent_ip),
+ TP_ARGS(ip, parent_ip));
+#endif
+
+#endif /* _TRACE_PREEMPTIRQ_H */
+
+#include <trace/define_trace.h>
+
+#else /* !CONFIG_PREEMPTIRQ_EVENTS */
+
+#define trace_irq_enable(...)
+#define trace_irq_disable(...)
+#define trace_preempt_enable(...)
+#define trace_preempt_disable(...)
+#define trace_irq_enable_rcuidle(...)
+#define trace_irq_disable_rcuidle(...)
+#define trace_preempt_enable_rcuidle(...)
+#define trace_preempt_disable_rcuidle(...)
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5f5b66a2f156..006eefb6ede0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -165,6 +165,17 @@ config FUNCTION_GRAPH_TRACER
 address on the current task structure into a stack of calls.

+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+ enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+ be disabled.
+
config IRQSOFF_TRACER
 bool "Interrupts-off Latency Tracer"
 default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index a9bba37fab5a..4b35fb97ae44 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 0e56eace0dde..21b162c07e83 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,9 @@
#include "trace.h"

+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -765,27 +768,54 @@ static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
#endif

+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
void trace_hardirqs_on(void)
{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
 tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on);

void trace_hardirqs_off(void)
{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
 tracer_hardirqs_off();
}
EXPORT_SYMBOL(trace_hardirqs_off);

__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
 tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);

__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
 tracer_hardirqs_off_caller(caller_addr);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
@@ -807,14 +837,17 @@ inline void print_irqtrace_events(struct task_struct *curr)
}
#endif

-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
+ trace_preempt_enable_rcuidle(a0, a1);
 tracer_preempt_on(a0, a1);
}

void trace_preempt_off(unsigned long a0, unsigned long a1)
{
+ trace_preempt_disable_rcuidle(a0, a1);
 tracer_preempt_off(a0, a1);
}
#endif

From 3ef0a0c8590b07fa8dfe0adb2bef6cc3baec5261 Mon Sep 17 00:00:00 2001
From: Behan Webster
Date: Fri, 21 Apr 2017 11:20:01 -0700
Subject: [PATCH 1158/3220] UPSTREAM: kbuild: Add better clang cross build support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add cross target to CC if using clang. Also add custom gcc toolchain
path for fallback gcc tools.

Clang will fall back to using things like ld, as, and libgcc if
(respectively) one of the llvm linkers isn't available, the integrated
assembler is turned off, or an appropriately cross-compiled version of
compiler-rt isn't available. To this end, you can specify the path to
this fallback gcc toolchain with GCC_TOOLCHAIN.

Signed-off-by: Behan Webster
Reviewed-by: Jan-Simon Möller
Reviewed-by: Mark Charlebois
Signed-off-by: Greg Hackmann
Signed-off-by: Matthias Kaehlcke
Signed-off-by: Masahiro Yamada
(cherry picked from commit 785f11aa595bc3d4e74096cbd598ada54ecc0d81)
Signed-off-by: Greg Hackmann
Change-Id: I9e4ca1a149bc793b749952f1b5734bbc11777e65
---
 Makefile | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Makefile b/Makefile
index 0b037152f590..e87105a95b4e 100644
--- a/Makefile
+++ b/Makefile
@@ -694,6 +694,15 @@ endif
KBUILD_CFLAGS += $(stackp-flag)

ifeq ($(cc-name),clang)
+ifneq ($(CROSS_COMPILE),)
+CLANG_TARGET := -target $(notdir $(CROSS_COMPILE:%-=%))
+GCC_TOOLCHAIN := $(realpath $(dir $(shell which $(LD)))/..)
+endif
+ifneq ($(GCC_TOOLCHAIN),)
+CLANG_GCC_TC := -gcc-toolchain $(GCC_TOOLCHAIN)
+endif
+KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
+KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)

From 8a89d5fc7cce80477a8e8405d8f9c811758a16ed Mon Sep 17 00:00:00 2001
From: Michael Davidson
Date: Tue, 25 Apr 2017 15:47:35 -0700
Subject: [PATCH 1159/3220] UPSTREAM: kbuild: clang: add -no-integrated-as to KBUILD_[AC]FLAGS

The Linux Kernel relies on GCC's acceptance of inline assembly as an
opaque object which will not have any validation performed on the
content. The current behaviour in LLVM is to perform validation of the
contents by means of parsing the input if the MC layer can handle it.

Disable clang's integrated assembler and use the GNU assembler instead.

Wording-mostly-from: Saleem Abdulrasool
Signed-off-by: Michael Davidson
Signed-off-by: Matthias Kaehlcke
Signed-off-by: Masahiro Yamada
(cherry picked from commit a37c45cd82e62a361706b9688a984a3a63957321)
Signed-off-by: Greg Hackmann
Change-Id: Iae412ad5294f8f5e4d66f0085a5dd70f5464ac91
---
 Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index e87105a95b4e..75895a4b74b8 100644
--- a/Makefile
+++ b/Makefile
@@ -715,6 +715,8 @@ KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
# See modpost pattern 2
KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
+KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
+KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
else

# These warnings generated too much noise in a regular build.

From a3337e2374f1bddb7fcbc4d14a76b95989f949bb Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke
Date: Wed, 12 Apr 2017 12:43:52 -0700
Subject: [PATCH 1160/3220] UPSTREAM: kbuild: Consolidate header generation from ASM offset information

Largely redundant code is used in different places to generate C
headers from offset information extracted from assembly language
output. Consolidate the code in Makefile.lib and use this instead.
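For reference, a minimal shell sketch of the transformation being
consolidated (the input line is a made-up example of what the compiler
emits for a DEFINE() marker in an asm-offsets.s file; the sed expression
is the generic pattern from the consolidated helper):

  $ printf -- '->TI_flags $8 offsetof(struct thread_info, flags)\n' | \
    sed -n '/^->/{s:^->\([^ ]*\) [$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}'
  #define TI_flags 8 /* offsetof(struct thread_info, flags) */

filechk_offsets then wraps this output in the include guard passed as its
argument, e.g. __DEVICETABLE_OFFSETS_H__.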
Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit ebf003f0cfb3705e60d40dedc3ec949176c741af) Signed-off-by: Greg Hackmann Change-Id: I0acd54dd27c0cf0868f221bd63728a9b67320b25 --- Kbuild | 25 ------------------------- arch/ia64/kernel/Makefile | 26 ++------------------------ scripts/Makefile.lib | 28 ++++++++++++++++++++++++++++ scripts/mod/Makefile | 28 ++-------------------------- 4 files changed, 32 insertions(+), 75 deletions(-) diff --git a/Kbuild b/Kbuild index f55cefd9bf29..f56ed561a284 100644 --- a/Kbuild +++ b/Kbuild @@ -6,31 +6,6 @@ # 3) Generate asm-offsets.h (may need bounds.h and timeconst.h) # 4) Check for missing system calls -# Default sed regexp - multiline due to syntax constraints -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef - -# Use filechk to avoid rebuilds when a header changes, but the resulting file -# does not -define filechk_offsets - (set -e; \ - echo "#ifndef $2"; \ - echo "#define $2"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y); \ - echo ""; \ - echo "#endif" ) -endef - ##### # 1) Generate bounds.h diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 3686d6abafde..9edda5466020 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -50,32 +50,10 @@ CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 # The gate DSO image is built using a special linker script. include $(src)/Makefile.gate -# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config -define sed-y - "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}" -endef -quiet_cmd_nr_irqs = GEN $@ -define cmd_nr_irqs - (set -e; \ - echo "#ifndef __ASM_NR_IRQS_H__"; \ - echo "#define __ASM_NR_IRQS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ -endef - # We use internal kbuild rules to avoid the "is up to date" message from make arch/$(SRCARCH)/kernel/nr-irqs.s: arch/$(SRCARCH)/kernel/nr-irqs.c $(Q)mkdir -p $(dir $@) $(call if_changed_dep,cc_s_c) -include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s - $(Q)mkdir -p $(dir $@) - $(call cmd,nr_irqs) +include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s FORCE + $(call filechk,offsets,__ASM_NR_IRQS_H__) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index c84080885ad4..a6a330671eba 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -394,3 +394,31 @@ quiet_cmd_xzmisc = XZMISC $@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ xz --check=crc32 --lzma2=dict=1MiB) > $@ || \ (rm -f $@ ; false) + +# ASM offsets +# --------------------------------------------------------------------------- + +# Default sed regexp - multiline due to syntax constraints +define sed-offsets + "/^->/{s:->#\(.*\):/* \1 */:; \ + s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:->::; p;}" +endef + +# Use filechk to avoid rebuilds when a header changes, but the resulting file +# does not +define filechk_offsets + (set -e; \ + echo "#ifndef $2"; \ + echo "#define $2"; \ + echo "/*"; \ + echo " * DO NOT 
MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-offsets); \ + echo ""; \ + echo "#endif" ) +endef diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index c11212ff3510..e0cb2e4a3b15 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -5,32 +5,8 @@ modpost-objs := modpost.o file2alias.o sumversion.o devicetable-offsets-file := devicetable-offsets.h -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef - -quiet_cmd_offsets = GEN $@ -define cmd_offsets - (set -e; \ - echo "#ifndef __DEVICETABLE_OFFSETS_H__"; \ - echo "#define __DEVICETABLE_OFFSETS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ -endef - -$(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s - $(call if_changed,offsets) +$(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s FORCE + $(call filechk,offsets,__DEVICETABLE_OFFSETS_H__) targets += $(devicetable-offsets-file) devicetable-offsets.s From 54d8c15081750c6a44369bc9372e1277361fd480 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 21 Apr 2017 15:21:10 +0900 Subject: [PATCH 1161/3220] UPSTREAM: kbuild: consolidate redundant sed script ASM offset generation This part ended up in redundant code after touched by multiple people. [1] Commit 3234282f33b2 ("x86, asm: Fix CFI macro invocations to deal with shortcomings in gas") added parentheses for defined expressions to support old gas for x86. [2] Commit a22dcdb0032c ("x86, asm: Fix ancient-GAS workaround") split the pattern into two to avoid parentheses for non-numeric expressions. [3] Commit 95a2f6f72d37 ("Partially revert patch that encloses asm-offset.h numbers in brackets") removed parentheses from numeric expressions as well because parentheses in MN10300 assembly have a special meaning (pointer access). Apparently, there is a conflict between [1] and [3]. After all, [3] took precedence, and a long time has passed since then. Now, merge the two patterns again because the first one is covered by the other. Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke (cherry picked from commit 7dd47b95b0f54f2057d40af6e66d477e3fe95d13) Signed-off-by: Greg Hackmann Change-Id: Idf9e632df984fbc9cb834e7f7b5d33f21da87dbc --- scripts/Makefile.lib | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index a6a330671eba..6219c5fcb801 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -401,7 +401,6 @@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ # Default sed regexp - multiline due to syntax constraints define sed-offsets "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ s:->::; p;}" endef From 03e66b365eda022da0e82df4685e1aefcbdc656e Mon Sep 17 00:00:00 2001 From: Jeroen Hofstee Date: Fri, 21 Apr 2017 15:21:11 +0900 Subject: [PATCH 1162/3220] UPSTREAM: kbuild: fix asm-offset generation to work with clang KBuild abuses the asm statement to write to a file and clang chokes about these invalid asm statements. Hack it even more by fooling this is actual valid asm code. 
[masahiro: Import Jeroen's work for U-Boot: http://patchwork.ozlabs.org/patch/375026/ Tweak sed script a little to avoid garbage '#' for GCC case, like #define NR_PAGEFLAGS 23 /* __NR_PAGEFLAGS # */ ] Signed-off-by: Jeroen Hofstee Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke (cherry picked from commit cf0c3e68aa81f992b0301f62e341b710d385bf68) Signed-off-by: Greg Hackmann Change-Id: Ifbfd4eff59a7f4304f0d8fdcba4075100244562f --- include/linux/kbuild.h | 6 +++--- scripts/Makefile.lib | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/kbuild.h b/include/linux/kbuild.h index 22a72198c14b..4e80f3a9ad58 100644 --- a/include/linux/kbuild.h +++ b/include/linux/kbuild.h @@ -2,14 +2,14 @@ #define __LINUX_KBUILD_H #define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val)) -#define BLANK() asm volatile("\n->" : : ) +#define BLANK() asm volatile("\n.ascii \"->\"" : : ) #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)) #define COMMENT(x) \ - asm volatile("\n->#" x) + asm volatile("\n.ascii \"->#" x "\"") #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 6219c5fcb801..c3b3c94e2446 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -399,10 +399,14 @@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ # --------------------------------------------------------------------------- # Default sed regexp - multiline due to syntax constraints +# +# Use [:space:] because LLVM's integrated assembler inserts <tab> around +# the .ascii directive whereas GCC keeps the <space> as-is. define sed-offsets - "/^->/{s:->#\(.*\):/* \1 */:; \ + 's:^[[:space:]]*\.ascii[[:space:]]*"\(.*\)".*:\1:; \ + /^->/{s:->#\(.*\):/* \1 */:; \ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" + s:->::; p;}' endef # Use filechk to avoid rebuilds when a header changes, but the resulting file From 4b44c97fed0555b2406188ad49ac96d6be0fc329 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Apr 2017 07:25:21 +0900 Subject: [PATCH 1163/3220] UPSTREAM: kbuild: drop -Wno-unknown-warning-option from clang options Since commit c3f0d0bc5b01 ("kbuild, LLVMLinux: Add -Werror to cc-option to support clang"), cc-option and friends work nicely for clang. However, -Wno-unknown-warning-option makes clang happy with any unknown warning options even if -Werror is specified. Once -Wno-unknown-warning-option is added, any succeeding call of cc-disable-warning is evaluated as positive, and then unknown warning options are accepted. This should be dropped.
Signed-off-by: Masahiro Yamada (cherry picked from commit a0ae981eba8f07dbc74bce38fd3a462b69a5bc8e) Signed-off-by: Greg Hackmann Change-Id: I0535e20fbcecc2d431e9f08b1f274c5d96626af1 --- Makefile | 1 - scripts/Makefile.extrawarn | 1 - 2 files changed, 2 deletions(-) diff --git a/Makefile b/Makefile index 75895a4b74b8..ecc86bec38e5 100644 --- a/Makefile +++ b/Makefile @@ -704,7 +704,6 @@ endif KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) -KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,) KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index da3386a9d244..abe5f47b1ab0 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -61,7 +61,6 @@ ifeq ($(cc-name),clang) KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides) KBUILD_CFLAGS += $(call cc-disable-warning, unused-value) KBUILD_CFLAGS += $(call cc-disable-warning, format) -KBUILD_CFLAGS += $(call cc-disable-warning, unknown-warning-option) KBUILD_CFLAGS += $(call cc-disable-warning, sign-compare) KBUILD_CFLAGS += $(call cc-disable-warning, format-zero-length) KBUILD_CFLAGS += $(call cc-disable-warning, uninitialized) From cafc5bc76374ba807d570055be4e217572d33a49 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 5 Apr 2017 14:29:32 -0700 Subject: [PATCH 1164/3220] BACKPORT: kbuild, LLVMLinux: Add -Werror to cc-option to support clang Clang will warn about unknown warnings but will not return false unless -Werror is set. GCC will return false if an unknown warning is passed. Adding -Werror makes both compilers behave the same. [arnd: it turns out we need the same patch for testing whether -ffunction-sections works right with gcc. I've build tested extensively with this patch applied, so let's just merge this one now.]
Upstream commit: c3f0d0bc5b01 Change-Id: I72c97bab5deaa47adef1bc535dcf19b7d2e0dbdf Signed-off-by: Mark Charlebois Signed-off-by: Behan Webster Reviewed-by: Jan-Simon Möller Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Signed-off-by: Masahiro Yamada Signed-off-by: Matthias Kaehlcke Signed-off-by: Greg Hackmann --- scripts/Kbuild.include | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 1db6d73c8dd2..30d9343f0c4b 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -111,12 +111,12 @@ as-instr = $(call try-run,\ # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) cc-option = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) # cc-option-yn # Usage: flag := $(call cc-option-yn,-march=winchip-c6) cc-option-yn = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) # cc-option-align # Prefix align with either -falign or -malign @@ -126,7 +126,7 @@ cc-option-align = $(subst -functions=0,,\ # cc-disable-warning # Usage: cflags-y += $(call cc-disable-warning,unused-but-set-variable) cc-disable-warning = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # cc-name # Expands to either gcc or clang From 6602531121707294674793ed27100118be4b87bc Mon Sep 17 00:00:00 2001 From: Behan Webster Date: Mon, 27 Mar 2017 18:19:09 -0700 Subject: [PATCH 1165/3220] UPSTREAM: kbuild: use -Oz instead of -Os when using clang This generates smaller resulting object code when compiled with clang. Signed-off-by: Behan Webster Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit 6748cb3c299de1ffbe56733647b01dbcc398c419) Signed-off-by: Greg Hackmann Change-Id: I5336ef3af6c7c638d9a68661c3c0e3f22693fdc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ecc86bec38e5..ce950588a177 100644 --- a/Makefile +++ b/Makefile @@ -624,7 +624,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context) ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE -KBUILD_CFLAGS += -Os +KBUILD_CFLAGS += $(call cc-option,-Oz,-Os) else ifdef CONFIG_PROFILE_ALL_BRANCHES KBUILD_CFLAGS += -O2 From 588ae6ad5f5f31ea2c065553252a681a015a4c79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vin=C3=ADcius=20Tinti?= Date: Mon, 24 Apr 2017 13:04:58 -0700 Subject: [PATCH 1166/3220] BACKPORT: kbuild: Add support to generate LLVM assembly files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add rules to kbuild in order to generate LLVM assembly files with the .ll extension when using clang.
# from c code make CC=clang kernel/pid.ll Signed-off-by: Vinícius Tinti Signed-off-by: Behan Webster Signed-off-by: Matthias Kaehlcke Signed-off-by: Masahiro Yamada (cherry picked from commit 433db3e260bc8134d4a46ddf20b3668937e12556) Signed-off-by: Greg Hackmann Change-Id: I1fcc7ec14357e19e46cc2dd1772c5c258aec91d1 --- .gitignore | 1 + Makefile | 5 +++++ scripts/Makefile.build | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/.gitignore b/.gitignore index fa3e5f1d0808..b17dcd217d83 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ *.lzo *.patch *.gcno +*.ll modules.builtin Module.symvers *.dwo diff --git a/Makefile b/Makefile index ce950588a177..482f92e952d4 100644 --- a/Makefile +++ b/Makefile @@ -1291,6 +1291,8 @@ help: @echo ' (default: $$(INSTALL_MOD_PATH)/lib/firmware)' @echo ' dir/ - Build all files in dir and below' @echo ' dir/file.[ois] - Build specified target only' + @echo ' dir/file.ll - Build the LLVM assembly file' + @echo ' (requires compiler support for LLVM assembly generation)' @echo ' dir/file.lst - Build specified mixed source/assembly target only' @echo ' (requires a recent binutils and recent build (System.map))' @echo ' dir/file.ko - Build module including final link' @@ -1466,6 +1468,7 @@ clean: $(clean-dirs) -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '*.symtypes' -o -name 'modules.order' \ -o -name modules.builtin -o -name '.tmp_*.o.*' \ + -o -name '*.ll' \ -o -name '*.gcno' \) -type f -print | xargs rm -f # Generate tags for editors @@ -1569,6 +1572,8 @@ endif $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) %.symtypes: %.c prepare scripts FORCE $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) +%.ll: %.c prepare scripts FORCE + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) # Modules /: prepare scripts FORCE diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 01df30af4d4a..411281c36898 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -174,6 +174,14 @@ cmd_cc_symtypes_c = \ $(obj)/%.symtypes : $(src)/%.c FORCE $(call cmd,cc_symtypes_c) +# LLVM assembly +# Generate .ll files from .c +quiet_cmd_cc_ll_c = CC $(quiet_modtag) $@ + cmd_cc_ll_c = $(CC) $(c_flags) -emit-llvm -S -o $@ $< + +$(obj)/%.ll: $(src)/%.c FORCE + $(call if_changed_dep,cc_ll_c) + # C (.c) files # The C file is compiled and updated dependency information is generated. # (See cmd_cc_o_c + relevant part of rule_cc_o_c) From b8a6c2329c3e11c6a671ef0349f709ec2a91384a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 1 Feb 2017 18:00:14 +0100 Subject: [PATCH 1167/3220] UPSTREAM: modules: mark __inittest/__exittest as __maybe_unused clang warns about unused inline functions by default: arch/arm/crypto/aes-cipher-glue.c:68:1: warning: unused function '__inittest' [-Wunused-function] arch/arm/crypto/aes-cipher-glue.c:69:1: warning: unused function '__exittest' [-Wunused-function] As these appear in every single module, let's just disable the warnings by marking the two functions as __maybe_unused. 
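A minimal sketch of what the macro expands to, and why clang complained (hypothetical driver name, types simplified from include/linux/module.h):

  typedef int (*initcall_t)(void);

  static int mydrv_init(void)     /* hypothetical module init function */
  {
          return 0;
  }

  /* What module_init(mydrv_init) boils down to: __inittest() exists only
   * so the compiler type-checks that mydrv_init matches initcall_t.  It
   * is never called, so clang's -Wunused-function fires in every module
   * unless the function carries __maybe_unused, i.e.
   * __attribute__((unused)). */
  static inline initcall_t __attribute__((unused)) __inittest(void)
  {
          return mydrv_init;
  }
  int init_module(void) __attribute__((alias("mydrv_init")));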
Signed-off-by: Arnd Bergmann Reviewed-by: Miroslav Benes Acked-by: Rusty Russell Signed-off-by: Jessica Yu (cherry picked from commit 1f318a8bafcfba9f0d623f4870c4e890fd22e659) Signed-off-by: Greg Hackmann Change-Id: I39c75bdb61834020320d41a678dfcc9442f07e4b --- include/linux/module.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index b229a9961d02..e2324653c8ef 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -125,13 +125,13 @@ extern void cleanup_module(void); /* Each module must use one module_init(). */ #define module_init(initfn) \ - static inline initcall_t __inittest(void) \ + static inline initcall_t __maybe_unused __inittest(void) \ { return initfn; } \ int init_module(void) __attribute__((alias(#initfn))); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ - static inline exitcall_t __exittest(void) \ + static inline exitcall_t __maybe_unused __exittest(void) \ { return exitfn; } \ void cleanup_module(void) __attribute__((alias(#exitfn))); From f01b0c528e75ee1e8ec13da27bd987f34b8fbe10 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Feb 2016 15:38:32 +0100 Subject: [PATCH 1168/3220] UPSTREAM: Kbuild: provide a __UNIQUE_ID for clang The default __UNIQUE_ID macro in compiler.h fails to work for some drivers: drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:615:1: error: redefinition of '__UNIQUE_ID_firmware615' BRCMF_FW_NVRAM_DEF(4354, "brcmfmac4354-sdio.bin", "brcmfmac4354-sdio.txt"); This adds a copy of the version we use for gcc-4.3 and higher, as the same one works with all versions of clang that I could find in svn (2.6 and higher). Signed-off-by: Arnd Bergmann Signed-off-by: Michal Marek (cherry picked from commit b41c29b0527c7fd6a95d0f71274abb79933bf960) Signed-off-by: Greg Hackmann Change-Id: I161dfa3ccb6b226966c3c87bba6b2fff1561bc61 --- include/linux/compiler-clang.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d1e49d52b640..de179993e039 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -10,3 +10,8 @@ #undef uninitialized_var #define uninitialized_var(x) x = *(&(x)) #endif + +/* same as gcc, this was present in clang-2.6 so we can assume it works + * with any version that can compile the kernel + */ +#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) From 79ec10f9925af458c1cb5dfe31cbb12a7f841d1c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jun 2017 13:36:24 -0700 Subject: [PATCH 1169/3220] UPSTREAM: compiler, clang: suppress warning for unused static inline functions GCC explicitly does not warn for unused static inline functions for -Wunused-function. The manual states: Warn whenever a static function is declared but not defined or a non-inline static function is unused. Clang does warn for static inline functions that are unused. It turns out that suppressing the warnings avoids potentially complex #ifdef directives, which also reduces LOC. Suppress the warning for clang.
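A sketch of the pattern involved (function name invented; imagine it living in a widely included header):

  /* Typically defined in a shared header, used by only some includers: */
  static inline int widget_is_ready(int state)
  {
          return state > 0;
  }

  /* In a translation unit that includes the header but never calls
   * widget_is_ready(), gcc stays silent (static inline is exempt from
   * -Wunused-function) while clang warns.  With this patch, clang
   * effectively sees
   *   static inline __attribute__((unused)) int widget_is_ready(...)
   * and stays quiet as well. */
  int consumer_does_other_things(void)
  {
          return 0;
  }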
Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds (cherry picked from commit abb2ea7dfd82451d85ce669b811310c05ab5ca46) Signed-off-by: Greg Hackmann Change-Id: I68e6246b03c962cc87b9d0bf4b7fefeda27068c0 --- include/linux/compiler-clang.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index de179993e039..ea9126006a69 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -15,3 +15,10 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +/* + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well. + */ +#define inline inline __attribute__((unused)) From f2ea3999ecf1108b3ae0dcc14eab420701e4cf0a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 Jun 2017 15:51:56 -0700 Subject: [PATCH 1170/3220] UPSTREAM: compiler, clang: properly override 'inline' for clang Commit abb2ea7dfd82 ("compiler, clang: suppress warning for unused static inline functions") just caused more warnings due to re-defining the 'inline' macro. So undef it before re-defining it, and also add the 'notrace' attribute like the gcc version that this is overriding does. Maybe this makes clang happier. Signed-off-by: Linus Torvalds (cherry picked from commit 6d53cefb18e4646fb4bf62ccb6098fb3808486df) Signed-off-by: Greg Hackmann Change-Id: Ie01b45583954c6104c854a3810e35c1171764e78 --- include/linux/compiler-clang.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ea9126006a69..d614c5ea1b5e 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -21,4 +21,5 @@ * -Wunused-function. This turns out to avoid the need for complex #ifdef * directives. Suppress the warning in clang as well. */ -#define inline inline __attribute__((unused)) +#undef inline +#define inline inline __attribute__((unused)) notrace From 3717411e828d8af7428987070f14819d9d809636 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 13 Apr 2017 10:26:09 -0700 Subject: [PATCH 1171/3220] UPSTREAM: x86/kbuild: Use cc-option to enable -falign-{jumps/loops} clang currently does not support these optimizations, so only enable them when they are available.
Signed-off-by: Matthias Kaehlcke Cc: Greg Hackmann Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Michael Davidson Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: grundler@chromium.org Link: http://lkml.kernel.org/r/20170413172609.118122-1-mka@chromium.org Signed-off-by: Ingo Molnar (cherry picked from commit 2c4fd1ac3ff167c91272dc43c7bfd2269ef61557) Signed-off-by: Greg Hackmann Change-Id: Id040421dcf782c9a5b20a72cf68360b36da8f824 --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 53949c886341..360713b8e258 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -88,10 +88,10 @@ else KBUILD_CFLAGS += -m64 # Align jump targets to 1 byte, not the default 16 bytes: - KBUILD_CFLAGS += -falign-jumps=1 + KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1) # Pack loops tightly as well: - KBUILD_CFLAGS += -falign-loops=1 + KBUILD_CFLAGS += $(call cc-option,-falign-loops=1) # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) From 2a3a3862527dbdba157c5973cbd231a20b20d253 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Wed, 15 Mar 2017 15:36:00 -0700 Subject: [PATCH 1172/3220] UPSTREAM: crypto, x86: aesni - fix token pasting for clang aes_ctrby8_avx-x86_64.S uses the C preprocessor for token pasting of character sequences that are not valid preprocessor tokens. While this is allowed when preprocessing assembler files, it exposes an incompatibility between the clang and gcc preprocessors where clang does not strip leading white space from macro parameters, leading to the CONCAT(%xmm, i) macro expansion on line 96 resulting in a token with a space character embedded in it. While this could be resolved by deleting the offending space character, the assembler is perfectly capable of doing the token pasting correctly for itself so we can just get rid of the preprocessor macros. Signed-off-by: Michael Davidson Signed-off-by: Herbert Xu (cherry picked from commit fdb2726f4e61c5e3abc052f547d5a5f6c0dc5504) Signed-off-by: Greg Hackmann Change-Id: I087414d3575ea7b8703f39d429ccbf0361b314ae --- arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index a916c4a61165..5f6a5af9c489 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -65,7 +65,6 @@ #include #include -#define CONCAT(a,b) a##b #define VMOVDQ vmovdqu #define xdata0 %xmm0 @@ -92,8 +91,6 @@ #define num_bytes %r8 #define tmp %r10 -#define DDQ(i) CONCAT(ddq_add_,i) -#define XMM(i) CONCAT(%xmm, i) #define DDQ_DATA 0 #define XDATA 1 #define KEY_128 1 @@ -131,12 +128,12 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ .macro setddq n - var_ddq_add = DDQ(\n) + var_ddq_add = ddq_add_\n .endm /* generate a unique variable for xmm register */ .macro setxdata n - var_xdata = XMM(\n) + var_xdata = %xmm\n .endm /* club the numeric 'id' to the symbol 'name' */ From cafc5bc76374ba807d570055be4e217572d33a49 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 5 May 2017 09:50:49 -0700 Subject: [PATCH 1173/3220] BACKPORT: x86/mm/kaslr: Use the _ASM_MUL macro for multiplication to work around Clang incompatibility The constraint "rm" allows the compiler to put mix_const into memory. When the input operand is a memory location then MUL needs an operand size suffix, since Clang can't infer the multiplication width from the operand.
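Reduced to a standalone sketch (this mirrors the get_random_long() pattern rather than quoting it verbatim):

  unsigned long diffuse(unsigned long random, unsigned long mix_const)
  {
          unsigned long raw;

          /* "rm" lets the compiler keep mix_const in memory.  When it
           * does, bare "mul %3" has no register operand from which the
           * assembler could infer an operand size, so the instruction
           * cannot be sized and clang's integrated assembler rejects
           * it; "mulq %3" (or _ASM_MUL) is unambiguous. */
          asm("mul %3"
              : "=a" (random), "=d" (raw)
              : "a" (random), "rm" (mix_const));
          return random + raw;
  }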
Add and use the _ASM_MUL macro which determines the operand size and resolves to the MUL instruction with the corresponding suffix. This fixes the following error when building with clang: CC arch/x86/lib/kaslr.o /tmp/kaslr-dfe1ad.s: Assembler messages: /tmp/kaslr-dfe1ad.s:182: Error: no instruction mnemonic suffix given and no register operands; can't size instruction Upstream commit: 121843eb02a6 Change-Id: I53f51839705dabeb6c950d1def3a45881294129c Signed-off-by: Matthias Kaehlcke Cc: Grant Grundler Cc: Greg Hackmann Cc: Kees Cook Cc: Linus Torvalds Cc: Michael Davidson Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170501224741.133938-1-mka@chromium.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Hackmann --- arch/x86/boot/compressed/aslr.c | 2 +- arch/x86/include/asm/asm.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 6a9b96b4624d..4a5fbd2da658 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -94,7 +94,7 @@ static unsigned long get_random_long(void) } /* Circular multiply for better bit diffusion */ - asm("mul %3" + asm(_ASM_MUL "%3" : "=a" (random), "=d" (raw) : "a" (random), "rm" (mix_const)); random += raw; diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679aba703..1f16ec50abeb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -32,6 +32,7 @@ #define _ASM_ADD __ASM_SIZE(add) #define _ASM_SUB __ASM_SIZE(sub) #define _ASM_XADD __ASM_SIZE(xadd) +#define _ASM_MUL __ASM_SIZE(mul) #define _ASM_AX __ASM_REG(ax) #define _ASM_BX __ASM_REG(bx) From fd5df2a435a5f2015b3b11504574e500ca3ab452 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 May 2016 12:56:27 +0200 Subject: [PATCH 1174/3220] BACKPORT: x86/hweight: Get rid of the special calling convention People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench into experimentation with kcov, lto, etc. Add asm versions for __sw_hweight{32,64}() and do explicit saving and restoring of clobbered registers. This gets rid of the special calling convention. We get to call those functions on !X86_FEATURE_POPCNT CPUs. We still need to hardcode POPCNT and register operands as some old gas versions which we support, do not know about POPCNT. Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives can do padding now. Suggested-by: H.
Peter Anvin Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464605787-20603-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar (cherry picked from commit f5967101e9de12addcda4510dfbac66d7c5779c3) Signed-off-by: Matthias Kaehlcke Signed-off-by: Greg Hackmann Conflicts: lib/Makefile Change-Id: Ie7e6dce51c7093b1162337ec8bfc5abde0d79688 --- arch/x86/Kconfig | 5 -- arch/x86/include/asm/arch_hweight.h | 24 ++++----- arch/x86/kernel/i386_ksyms_32.c | 2 + arch/x86/kernel/x8664_ksyms_64.c | 3 ++ arch/x86/lib/Makefile | 2 +- arch/x86/lib/hweight.S | 77 +++++++++++++++++++++++++++++ lib/Makefile | 2 - lib/hweight.c | 4 ++ 8 files changed, 97 insertions(+), 22 deletions(-) create mode 100644 arch/x86/lib/hweight.S diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc9dbc5aa80f..412166ac999f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -296,11 +296,6 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR -config ARCH_HWEIGHT_CFLAGS - string - default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 - default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 259a7c1ef709..44f825c80ed5 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -2,8 +2,8 @@ #define _ASM_X86_HWEIGHT_H #ifdef CONFIG_64BIT -/* popcnt %edi, %eax -- redundant REX prefix for alignment */ -#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" +/* popcnt %edi, %eax */ +#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7" /* popcnt %rdi, %rax */ #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7" #define REG_IN "D" @@ -15,19 +15,15 @@ #define REG_OUT "a" #endif -/* - * __sw_hweightXX are called from within the alternatives below - * and callee-clobbered registers need to be taken care of. See - * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective - * compiler switches.
- */ +#define __HAVE_ARCH_SW_HWEIGHT + static __always_inline unsigned int __arch_hweight32(unsigned int w) { - unsigned int res = 0; + unsigned int res; asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } @@ -51,11 +47,11 @@ static inline unsigned long __arch_hweight64(__u64 w) #else static __always_inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; + unsigned long res; asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 64341aa485ae..d40ee8a38fed 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(___preempt_schedule); EXPORT_SYMBOL(___preempt_schedule_notrace); #endif + +EXPORT_SYMBOL(__sw_hweight32); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be19864..c7efd394c42b 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -42,6 +42,9 @@ EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(__sw_hweight32); +EXPORT_SYMBOL(__sw_hweight64); + /* * Export string functions. We normally rely on gcc builtin for most of these, * but gcc sometimes decides not to inline them. diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2587888d987..db4a1c6ea785 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -22,7 +22,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o -obj-y += msr.o msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S new file mode 100644 index 000000000000..02de3d74d2c5 --- /dev/null +++ b/arch/x86/lib/hweight.S @@ -0,0 +1,77 @@ +#include + +#include + +/* + * unsigned int __sw_hweight32(unsigned int w) + * %rdi: w + */ +ENTRY(__sw_hweight32) + +#ifdef CONFIG_X86_64 + movl %edi, %eax # w +#endif + __ASM_SIZE(push,) %__ASM_REG(dx) + movl %eax, %edx # w -> t + shrl %edx # t >>= 1 + andl $0x55555555, %edx # t &= 0x55555555 + subl %edx, %eax # w -= t + + movl %eax, %edx # w -> t + shrl $2, %eax # w_tmp >>= 2 + andl $0x33333333, %edx # t &= 0x33333333 + andl $0x33333333, %eax # w_tmp &= 0x33333333 + addl %edx, %eax # w = w_tmp + t + + movl %eax, %edx # w -> t + shrl $4, %edx # t >>= 4 + addl %edx, %eax # w_tmp += t + andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f + imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101 + shrl $24, %eax # w = w_tmp >> 24 + __ASM_SIZE(pop,) %__ASM_REG(dx) + ret +ENDPROC(__sw_hweight32) + +ENTRY(__sw_hweight64) +#ifdef CONFIG_X86_64 + pushq %rdx + + movq %rdi, %rdx # w -> t + movabsq $0x5555555555555555, %rax + shrq %rdx # t >>= 1 + andq %rdx, %rax # t &= 0x5555555555555555 + movabsq $0x3333333333333333, %rdx + subq %rax, %rdi # w -= t + + movq %rdi, %rax # w -> t + shrq $2, %rdi # w_tmp >>= 2 + andq %rdx, %rax # t &= 0x3333333333333333 + andq %rdi, %rdx # w_tmp &= 0x3333333333333333 + addq %rdx, %rax # w = w_tmp + t + + movq %rax, %rdx # w -> t + shrq $4, %rdx # t >>= 4 + addq %rdx, %rax # w_tmp += t + movabsq $0x0f0f0f0f0f0f0f0f, %rdx + andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f + movabsq $0x0101010101010101, 
%rdx + imulq %rdx, %rax # w_tmp *= 0x0101010101010101 + shrq $56, %rax # w = w_tmp >> 56 + + popq %rdx + ret +#else /* CONFIG_X86_32 */ + /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ + pushl %ecx + + call __sw_hweight32 + movl %eax, %ecx # stash away result + movl %edx, %eax # second part of input + call __sw_hweight32 + addl %ecx, %eax # result + + popl %ecx + ret +#endif +ENDPROC(__sw_hweight64) diff --git a/lib/Makefile b/lib/Makefile index 7f1de26613d2..cb4f6aa95013 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -58,8 +58,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -GCOV_PROFILE_hweight.o := n -CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o diff --git a/lib/hweight.c b/lib/hweight.c index 9a5c1f221558..43273a7d83cf 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,6 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. */ +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); +#endif unsigned int __sw_hweight16(unsigned int w) { @@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); +#endif From 3afdf761c528f1407cb1e5facb258619f55f3b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 8 Aug 2016 20:35:29 +0300 Subject: [PATCH 1175/3220] UPSTREAM: x86/hweight: Don't clobber %rdi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The caller expects %rdi to remain intact, push+pop it to make that happen.
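For context, the call site looks roughly like this (a simplified sketch of __arch_hweight64(), with the ALTERNATIVE() machinery stripped out):

  extern unsigned long __sw_hweight64(unsigned long w);

  static inline unsigned long arch_hweight64(unsigned long w)
  {
          unsigned long res;

          /* The call is hidden inside the asm body; %rdi appears only
           * as an input ("D") with no clobber, so the compiler assumes
           * %rdi still holds w after the asm.  If __sw_hweight64
           * trashes it, the caller silently computes with garbage. */
          asm("call __sw_hweight64"
              : "=a" (res)
              : "D" (w));
          return res;
  }

Since %rdi is never declared as an output or clobber, the callee has to preserve it.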
Fixes the following kind of explosions on my core2duo machine when trying to reboot or shut down: general protection fault: 0000 [#1] PREEMPT SMP Modules linked in: i915 i2c_algo_bit drm_kms_helper cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea drm netconsole configfs binfmt_misc iTCO_wdt psmouse pcspkr snd_hda_codec_idt e100 coretemp hwmon snd_hda_codec_generic i2c_i801 mii i2c_smbus lpc_ich mfd_core snd_hda_intel uhci_hcd snd_hda_codec snd_hwdep snd_hda_core ehci_pci 8250 ehci_hcd snd_pcm 8250_base usbcore evdev serial_core usb_common parport_pc parport snd_timer snd soundcore CPU: 0 PID: 3070 Comm: reboot Not tainted 4.8.0-rc1-perf-dirty #69 Hardware name: /D946GZIS, BIOS TS94610J.86A.0087.2007.1107.1049 11/07/2007 task: ffff88012a0b4080 task.stack: ffff880123850000 RIP: 0010:[] [] x86_perf_event_update+0x52/0xc0 RSP: 0018:ffff880123853b60 EFLAGS: 00010087 RAX: 0000000000000001 RBX: ffff88012fc0a3c0 RCX: 000000000000001e RDX: 0000000000000000 RSI: 0000000040000000 RDI: ffff88012b014800 RBP: ffff880123853b88 R08: ffffffffffffffff R09: 0000000000000000 R10: ffffea0004a012c0 R11: ffffea0004acedc0 R12: ffffffff80000001 R13: ffff88012b0149c0 R14: ffff88012b014800 R15: 0000000000000018 FS: 00007f8b155cd700(0000) GS:ffff88012fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8b155f5000 CR3: 000000012a2d7000 CR4: 00000000000006f0 Stack: ffff88012fc0a3c0 ffff88012b014800 0000000000000004 0000000000000001 ffff88012fc1b750 ffff880123853bb0 ffffffff81003d59 ffff88012b014800 ffff88012fc0a3c0 ffff88012b014800 ffff880123853bd8 ffffffff81003e13 Call Trace: [] x86_pmu_stop+0x59/0xd0 [] x86_pmu_del+0x43/0x140 [] event_sched_out.isra.105+0xbd/0x260 [] __perf_remove_from_context+0x2d/0xb0 [] __perf_event_exit_context+0x4d/0x70 [] generic_exec_single+0xb6/0x140 [] ? __perf_remove_from_context+0xb0/0xb0 [] ? __perf_remove_from_context+0xb0/0xb0 [] smp_call_function_single+0xdf/0x140 [] perf_event_exit_cpu_context+0x87/0xc0 [] perf_reboot+0x13/0x40 [] notifier_call_chain+0x4a/0x70 [] __blocking_notifier_call_chain+0x47/0x60 [] blocking_notifier_call_chain+0x16/0x20 [] kernel_restart_prepare+0x1d/0x40 [] kernel_restart+0x12/0x60 [] SYSC_reboot+0xf6/0x1b0 [] ? mntput_no_expire+0x2c/0x1b0 [] ? mntput+0x24/0x40 [] ? __fput+0x16c/0x1e0 [] ? ____fput+0xe/0x10 [] ? task_work_run+0x83/0xa0 [] ? exit_to_usermode_loop+0x53/0xc0 [] ? trace_hardirqs_on_thunk+0x1a/0x1c [] SyS_reboot+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x18/0xa3 Code: 7c 4c 8d af c0 01 00 00 49 89 fe eb 10 48 09 c2 4c 89 e0 49 0f b1 55 00 4c 39 e0 74 35 4d 8b a6 c0 01 00 00 41 8b 8e 60 01 00 00 <0f> 33 8b 35 6e 02 8c 00 48 c1 e2 20 85 f6 7e d2 48 89 d3 89 cf RIP [] x86_perf_event_update+0x52/0xc0 RSP ---[ end trace 7ec95181faf211be ]--- note: reboot[3070] exited with preempt_count 2 Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Fixes: f5967101e9de ("x86/hweight: Get rid of the special calling convention") Signed-off-by: Ville Syrjälä Signed-off-by: Linus Torvalds (cherry picked from commit 65ea11ec6a82b1d44aba62b59e9eb20247e57c6e) Signed-off-by: Greg Hackmann Change-Id: Ib004aa044ba9fc73cfff97fe78c8607008ca3846 --- arch/x86/lib/hweight.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 02de3d74d2c5..8a602a1e404a 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -35,6 +35,7 @@ ENDPROC(__sw_hweight32) ENTRY(__sw_hweight64) #ifdef CONFIG_X86_64 + pushq %rdi pushq %rdx movq %rdi, %rdx # w -> t @@ -60,6 +61,7 @@ ENTRY(__sw_hweight64) shrq $56, %rax # w = w_tmp >> 56 popq %rdx + popq %rdi ret #else /* CONFIG_X86_32 */ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ From 423ba0fede66c9e367d2bab64339f85d6ce1d6f8 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:03 -0700 Subject: [PATCH 1176/3220] BACKPORT: kbuild: Add __cc-option macro cc-option uses KBUILD_CFLAGS and KBUILD_CPPFLAGS when it determines whether an option is supported or not. This is fine for options used to build the kernel itself, however some components like the x86 boot code use a different set of flags. Add the new macro __cc-option which is a more generic version of cc-option with additional parameters. One parameter is the compiler with which the check should be performed, the other the compiler options to be used instead of KBUILD_C*FLAGS. Refactor cc-option and hostcc-option to use __cc-option and move hostcc-option to scripts/Kbuild.include. Suggested-by: Arnd Bergmann Suggested-by: Masahiro Yamada Signed-off-by: Matthias Kaehlcke Acked-by: Arnd Bergmann Acked-by: Michal Marek Signed-off-by: Masahiro Yamada (cherry picked from commit 9f3f1fd299768782465cb32cdf0dd4528d11f26b) Signed-off-by: Greg Hackmann Conflicts: scripts/Kbuild.include Change-Id: I4c8288b9c74bd6b9199307a0e04b78a27e28361d --- Makefile | 2 +- scripts/Kbuild.include | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 482f92e952d4..9f32fb972fb6 100644 --- a/Makefile +++ b/Makefile @@ -301,7 +301,7 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ HOSTCC = gcc HOSTCXX = g++ -HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 +HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 HOSTCXXFLAGS = -O2 ifeq ($(shell $(HOSTCC) -v 2>&1 | grep -c "clang version"), 1) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 30d9343f0c4b..8a1bb64f1dcd 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -107,11 +107,25 @@ as-option = $(call try-run,\ as-instr = $(call try-run,\ printf "%b\n" "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3)) +# __cc-option +# Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586) +__cc-option = $(call try-run,\ + $(1) -Werror $(2) $(3) -c -x c /dev/null -o "$$TMP",$(3),$(4)) + +# Do not attempt to build with gcc plugins during cc-option tests. +# (And this uses delayed resolution so the flags will be up to date.)
+CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) + # cc-option # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) -cc-option = $(call try-run,\ - $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) +cc-option = $(call __cc-option, $(CC),\ + $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS),$(1),$(2)) + +# hostcc-option +# Usage: cflags-y += $(call hostcc-option,-march=winchip-c6,-march=i586) +hostcc-option = $(call __cc-option, $(HOSTCC),\ + $(HOSTCFLAGS) $(HOST_EXTRACFLAGS),$(1),$(2)) # cc-option-yn # Usage: flag := $(call cc-option-yn,-march=winchip-c6) From 203bd5f980384f6c5449191a42865950e5d86b3a Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:04 -0700 Subject: [PATCH 1177/3220] UPSTREAM: x86/build: Use __cc-option for boot code compiler options cc-option is used to enable compiler options for the boot code if they are available. The macro uses KBUILD_CFLAGS and KBUILD_CPPFLAGS for the check, however these flags aren't used to build the boot code, in consequence cc-option can yield wrong results. For example -mpreferred-stack-boundary=2 is never set with a 64-bit compiler, since the setting is only valid for 16 and 32-bit binaries. This is also the case for 32-bit kernel builds, because the option -m32 is added to KBUILD_CFLAGS after the assignment of REALMODE_CFLAGS. Use __cc-option instead of cc-option for the boot mode options. The macro receives the compiler options as parameter instead of using KBUILD_C*FLAGS, for the boot code we pass REALMODE_CFLAGS. Also use separate statements for the __cc-option checks instead of performing them in the initial assignment of REALMODE_CFLAGS since the variable is an input of the macro. Signed-off-by: Matthias Kaehlcke Acked-by: Ingo Molnar Signed-off-by: Masahiro Yamada (cherry picked commit 032a2c4f65a2f81c93e161a11197ba19bc14a909) Signed-off-by: Greg Hackmann Change-Id: I7756f875771edb00238eb770be912f713407681a --- arch/x86/Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 360713b8e258..e3b8c237b828 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -24,10 +24,11 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) + -mno-mmx -mno-sse + +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -mpreferred-stack-boundary=2) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit From 6613aeae39d692d5a1f0432b76184b7ef84b4072 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 21 Jun 2017 16:28:05 -0700 Subject: [PATCH 1178/3220] UPSTREAM: x86/build: Specify stack alignment for clang For gcc stack alignment is configured with -mpreferred-stack-boundary=N, clang has the option -mstack-alignment=N for that purpose. Use the same alignment as with gcc. If the alignment is not specified clang assumes an alignment of 16 bytes, as required by the standard ABI. 
However as mentioned in d9b0cde91c60 ("x86-64, gcc: Use -mpreferred-stack-boundary=3 if supported") the standard kernel entry on x86-64 leaves the stack on an 8-byte boundary, as a consequence clang will keep the stack misaligned. Signed-off-by: Matthias Kaehlcke Acked-by: Ingo Molnar Signed-off-by: Masahiro Yamada (cherry picked commit d77698df39a512911586834d303275ea5fda74d0) Signed-off-by: Greg Hackmann Change-Id: I4283d10c6fe31cf194b35adc5371732b89eb3ae3 --- arch/x86/Makefile | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index e3b8c237b828..cb6af304d456 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -11,6 +11,14 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +# For gcc stack alignment is specified with -mpreferred-stack-boundary, +# clang has the option -mstack-alignment for that purpose. +ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) + cc_stack_align_opt := -mpreferred-stack-boundary +else ifneq ($(call cc-option, -mstack-alignment=4),) + cc_stack_align_opt := -mstack-alignment +endif + # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. # @@ -28,7 +36,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -mpreferred-stack-boundary=2) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -65,8 +73,10 @@ ifeq ($(CONFIG_X86_32),y) # with nonstandard options KBUILD_CFLAGS += -fno-pic - # prevent gcc from keeping the stack 16 byte aligned - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + # Align the stack to the register width instead of using the default + # alignment of 16 bytes. This reduces stack usage and the number of + # alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -100,8 +110,14 @@ else KBUILD_CFLAGS += -fno-pic - # Use -mpreferred-stack-boundary=3 if supported. - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # By default gcc and clang use a stack alignment of 16 bytes for x86. + # However the standard kernel entry on x86-64 leaves the stack on an + # 8-byte boundary. If the compiler isn't informed about the actual + # alignment it will generate extra alignment instructions for the + # default alignment which keep the stack *mis*aligned. + # Furthermore an alignment to the register width reduces stack usage + # and the number of alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) From ea2f9b02ed2fb2a0ab3d0f3d427db27ef550dab2 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 21 Apr 2017 14:39:30 -0700 Subject: [PATCH 1179/3220] UPSTREAM: kbuild: clang: Disable 'address-of-packed-member' warning clang generates plenty of these warnings in different parts of the code, to an extent that the warnings are little more than noise. Disable the 'address-of-packed-member' warning. 
Signed-off-by: Matthias Kaehlcke Reviewed-by: Douglas Anderson Signed-off-by: Masahiro Yamada (cherry picked from commit bfb38988c51e440fd7062ddf3157f7d8b1dd5d70) Signed-off-by: Greg Hackmann Change-Id: I35ecf1b35a908d41ee791a8a651e3cfb4edd081b --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 9f32fb972fb6..c8d44abe2df2 100644 --- a/Makefile +++ b/Makefile @@ -707,6 +707,7 @@ KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) # Quiet clang warning: comparison of unsigned expression < 0 is always false KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) # CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the From b9c115c89fcc8c3d8038cd7d14f8e754cebbcb93 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 26 Apr 2017 17:11:32 +0100 Subject: [PATCH 1180/3220] UPSTREAM: crypto: arm64/sha - avoid non-standard inline asm tricks Replace the inline asm which exports struct offsets as ELF symbols with proper const variables exposing the same values. This works around an issue with Clang which does not interpret the "i" (or "I") constraints in the same way as GCC. Signed-off-by: Ard Biesheuvel Tested-by: Matthias Kaehlcke Signed-off-by: Herbert Xu (cherry picked from commit f4857f4c2ee9aa4e2aacac1a845352b00197fb57) Signed-off-by: Greg Hackmann Change-Id: I1f882de15bd447d6fc41858dfc0cbfd3f6e2466c --- arch/arm64/crypto/sha1-ce-core.S | 6 ++++-- arch/arm64/crypto/sha1-ce-glue.c | 11 +++-------- arch/arm64/crypto/sha2-ce-core.S | 6 ++++-- arch/arm64/crypto/sha2-ce-glue.c | 13 +++++-------- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index c98e7e849f06..8550408735a0 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -82,7 +82,8 @@ ENTRY(sha1_ce_transform) ldr dgb, [x0, #16] /* load sha1_ce_state::finalize */ - ldr w4, [x0, #:lo12:sha1_ce_offsetof_finalize] + ldr_l w4, sha1_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] /* load input */ 0: ld1 {v8.4s-v11.4s}, [x1], #64 @@ -132,7 +133,8 @@ CPU_LE( rev32 v11.16b, v11.16b ) * the padding is handled by the C code in that case. 
*/ cbz x4, 3f - ldr x4, [x0, #:lo12:sha1_ce_offsetof_count] + ldr_l w4, sha1_ce_offsetof_count, x4 + ldr x4, [x0, x4] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index aefda9868627..ea319c055f5d 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -17,9 +17,6 @@ #include #include -#define ASM_EXPORT(sym, val) \ - asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); - MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); @@ -32,6 +29,9 @@ struct sha1_ce_state { asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, int blocks); +const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); +const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize); + static int sha1_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { @@ -52,11 +52,6 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); - ASM_EXPORT(sha1_ce_offsetof_count, - offsetof(struct sha1_ce_state, sst.count)); - ASM_EXPORT(sha1_ce_offsetof_finalize, - offsetof(struct sha1_ce_state, finalize)); - /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 01cfee066837..679c6c002f4f 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -88,7 +88,8 @@ ENTRY(sha2_ce_transform) ld1 {dgav.4s, dgbv.4s}, [x0] /* load sha256_ce_state::finalize */ - ldr w4, [x0, #:lo12:sha256_ce_offsetof_finalize] + ldr_l w4, sha256_ce_offsetof_finalize, x4 + ldr w4, [x0, x4] /* load input */ 0: ld1 {v16.4s-v19.4s}, [x1], #64 @@ -136,7 +137,8 @@ CPU_LE( rev32 v19.16b, v19.16b ) * the padding is handled by the C code in that case. 
*/ cbz x4, 3f - ldr x4, [x0, #:lo12:sha256_ce_offsetof_count] + ldr_l w4, sha256_ce_offsetof_count, x4 + ldr x4, [x0, x4] movi v17.2d, #0 mov x8, #0x80000000 movi v18.2d, #0 diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 7cd587564a41..0ed9486f75dd 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -17,9 +17,6 @@ #include #include -#define ASM_EXPORT(sym, val) \ - asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); - MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); @@ -32,6 +29,11 @@ struct sha256_ce_state { asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, int blocks); +const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, + sst.count); +const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, + finalize); + static int sha256_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { @@ -52,11 +54,6 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); - ASM_EXPORT(sha256_ce_offsetof_count, - offsetof(struct sha256_ce_state, sst.count)); - ASM_EXPORT(sha256_ce_offsetof_finalize, - offsetof(struct sha256_ce_state, finalize)); - /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. From e221e075ddd44d40fc4a51613c667168752e0051 Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Mon, 24 Jul 2017 16:51:55 -0700 Subject: [PATCH 1181/3220] UPSTREAM: x86/boot: #undef memcpy() et al in string.c undef memcpy() and friends in boot/string.c so that the functions defined here will have the correct names, otherwise we end up up trying to redefine __builtin_memcpy() etc. Surprisingly, GCC allows this (and, helpfully, discards the __builtin_ prefix from the function name when compiling it), but clang does not. Adding these #undef's appears to preserve what I assume was the original intent of the code. (cherry picked from commit 18d5e6c34a8eda438d5ad8b3b15f42dab01bf05d) Change-Id: I616a6a8ece533166367d987597e8c405c96441a2 Signed-off-by: Michael Davidson Signed-off-by: Matthias Kaehlcke Acked-by: H. Peter Anvin Cc: Arnd Bergmann Cc: Bernhard.Rosenkranzer@linaro.org Cc: Greg Hackmann Cc: Kees Cook Cc: Linus Torvalds Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170724235155.79255-1-mka@chromium.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Hackmann --- arch/x86/boot/string.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 06ceddb3a22e..1d56adea8a7c 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -16,6 +16,15 @@ #include "ctype.h" #include "string.h" +/* + * Undef these macros so that the functions that we provide + * here will have the correct names regardless of how string.h + * may have chosen to #define them. 
+ */ +#undef memcpy +#undef memset +#undef memcmp + int memcmp(const void *s1, const void *s2, size_t len) { u8 diff; From 75eb3438b28512e33877354349d01f2dd659f0e6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 6 Jul 2017 15:35:24 -0700 Subject: [PATCH 1182/3220] UPSTREAM: compiler, clang: always inline when CONFIG_OPTIMIZE_INLINING is disabled The motivation for commit abb2ea7dfd82 ("compiler, clang: suppress warning for unused static inline functions") was to suppress clang's warnings about unused static inline functions. For configs without CONFIG_OPTIMIZE_INLINING enabled, such as any non-x86 architecture, `inline' in the kernel implies that __attribute__((always_inline)) is used. Some code depends on that behavior, see https://lkml.org/lkml/2017/6/13/918: net/built-in.o: In function `__xchg_mb': arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99' arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99 The full fix would be to identify these breakages and annotate the functions with __always_inline instead of `inline'. But since we are late in the 4.12-rc cycle, simply carry forward the forced inlining behavior and work toward moving arm64, and other architectures, toward CONFIG_OPTIMIZE_INLINING behavior. (cherry picked from commit 9a04dbcfb33b4012d0ce8c0282f1e3ca694675b1) Change-Id: I13891c2f1e588d8c7febe5d2d57134abb31d6ecd Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1706261552200.1075@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Sodagudi Prasad Tested-by: Sodagudi Prasad Tested-by: Matthias Kaehlcke Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Hackmann --- include/linux/compiler-clang.h | 8 -------- include/linux/compiler-gcc.h | 18 +++++++++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d614c5ea1b5e..de179993e039 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -15,11 +15,3 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -/* - * GCC does not warn about unused static inline functions for - * -Wunused-function. This turns out to avoid the need for complex #ifdef - * directives. Suppress the warning in clang as well. - */ -#undef inline -#define inline inline __attribute__((unused)) notrace diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 287e698c28de..557dae96ce74 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -66,18 +66,22 @@ /* * Force always-inline if the user requests it so via the .config, - * or if gcc is too old: + * or if gcc is too old. + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well by using "unused" + * function attribute, which is redundant but not harmful for gcc. 
*/ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -#define inline inline __attribute__((always_inline)) notrace -#define __inline__ __inline__ __attribute__((always_inline)) notrace -#define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline,unused)) notrace +#define __inline__ __inline__ __attribute__((always_inline,unused)) notrace +#define __inline __inline __attribute__((always_inline,unused)) notrace #else /* A lot of inline functions can cause havoc with function tracing */ -#define inline inline notrace -#define __inline__ __inline__ notrace -#define __inline __inline notrace +#define inline inline __attribute__((unused)) notrace +#define __inline__ __inline__ __attribute__((unused)) notrace +#define __inline __inline __attribute__((unused)) notrace #endif #define __always_inline inline __attribute__((always_inline)) From 00b6078ea64bfbbc539a8924a328053cc6d16114 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Aug 2017 20:49:36 +0100 Subject: [PATCH 1183/3220] BACKPORT: efi/libstub/arm64: Force 'hidden' visibility for section markers To prevent the compiler from emitting absolute references to the section markers when running in PIC mode, override the visibility to 'hidden' for all contents of asm/sections.h Tested-by: Matthias Kaehlcke Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20170818194947.19347-4-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 0426a4e68f18d75515414361de9e3e1445d2644e) Signed-off-by: Greg Hackmann Change-Id: Ia438c3f0aa6abdbd9057dfe1db732a25aa98ef40 --- drivers/firmware/efi/libstub/arm64-stub.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index e0e6b74fef8f..1b78215d6cd2 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -9,9 +9,17 @@ * published by the Free Software Foundation. * */ + +/* + * To prevent the compiler from emitting GOT-indirected (and thus absolute) + * references to the section markers, override their visibility as 'hidden' + */ +#pragma GCC visibility push(hidden) +#include +#pragma GCC visibility pop + #include #include -#include #include "efistub.h" From 2f2860a504a30a7645c6a0ec06767c5c7677a4ea Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Aug 2017 20:49:37 +0100 Subject: [PATCH 1184/3220] UPSTREAM: efi/libstub/arm64: Set -fpie when building the EFI stub Clang may emit absolute symbol references when building in non-PIC mode, even when using the default 'small' code model, which is already mostly position independent to begin with, due to its use of adrp/add pairs that have a relative range of +/- 4 GB. The remedy is to pass the -fpie flag, which can be done safely now that the code has been updated to avoid GOT indirections (which may be emitted due to the compiler assuming that the PIC/PIE code may end up in a shared library that is subject to ELF symbol preemption) Passing -fpie when building code that needs to execute at an a priori unknown offset is arguably an improvement in any case, and given that the recent visibility changes allow the PIC build to pass with GCC as well, let's add -fpie for all arm64 builds rather than only for Clang. 
Tested-by: Matthias Kaehlcke
Signed-off-by: Ard Biesheuvel
Cc: Linus Torvalds
Cc: Matt Fleming
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20170818194947.19347-5-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar
(cherry picked from commit 91ee5b21ee026c49e4e7483de69b55b8b47042be)
Signed-off-by: Greg Hackmann
Change-Id: I0a011945239d39a2d1eb04c20bf1b9ceb7d2b91d
---
 drivers/firmware/efi/libstub/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 68bdd9f23e13..d1f1bde85d3f 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -10,7 +10,7 @@ cflags-$(CONFIG_X86)		+= -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 \
 				   -fPIC -fno-strict-aliasing -mno-red-zone \
 				   -mno-mmx -mno-sse

-cflags-$(CONFIG_ARM64)		:= $(subst -pg,,$(KBUILD_CFLAGS))
+cflags-$(CONFIG_ARM64)		:= $(subst -pg,,$(KBUILD_CFLAGS)) -fpie
 cflags-$(CONFIG_ARM)		:= $(subst -pg,,$(KBUILD_CFLAGS)) \
 				   -fno-builtin -fpic -mno-single-pic-base

From a61090a6d556ce3a8024b4b084bd7b41ffd2908e Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke
Date: Wed, 16 Aug 2017 17:47:40 -0700
Subject: [PATCH 1185/3220] UPSTREAM: x86/build: Fix stack alignment for CLang

Commit:

  d77698df39a5 ("x86/build: Specify stack alignment for clang")

intended to use the same stack alignment for clang as with gcc.

The two compilers use different options to configure the stack
alignment (gcc: -mpreferred-stack-boundary=n, clang:
-mstack-alignment=n).

The above commit assumes that the clang option uses the same parameter
type as gcc, i.e. that the alignment is specified as 2^n. However,
clang interprets the value of this option literally to use an alignment
of n; as a consequence, the stack remains misaligned.

Change the values used with -mstack-alignment to be the actual
alignment instead of a power of two.

cc-option isn't used here with the typical pattern of KBUILD_CFLAGS +=
$(call cc-option ...). The reason is that older gcc versions don't
support the -mpreferred-stack-boundary option; since cc-option doesn't
verify whether the alternative option is valid, it would incorrectly
select the clang option -mstack-alignment.

Signed-off-by: Matthias Kaehlcke
Cc: Arnd Bergmann
Cc: Bernhard.Rosenkranzer@linaro.org
Cc: Greg Hackmann
Cc: Kees Cook
Cc: Linus Torvalds
Cc: Masahiro Yamada
Cc: Michael Davidson
Cc: Nick Desaulniers
Cc: Peter Zijlstra
Cc: Stephen Hines
Cc: Thomas Gleixner
Cc: dianders@chromium.org
Link: http://lkml.kernel.org/r/20170817004740.170588-1-mka@chromium.org
Signed-off-by: Ingo Molnar
(cherry picked from commit 8f91869766c00622b2eaa8ee567db4f333b78c1a)
Signed-off-by: Greg Hackmann
Change-Id: I7991bfed754f5ac10ac8b383c20ec89d56b2afc0
---
 arch/x86/Makefile | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cb6af304d456..d2ec032a649a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -14,9 +14,11 @@ endif

 # For gcc stack alignment is specified with -mpreferred-stack-boundary,
 # clang has the option -mstack-alignment for that purpose.
 ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
-      cc_stack_align_opt := -mpreferred-stack-boundary
-else ifneq ($(call cc-option, -mstack-alignment=4),)
-      cc_stack_align_opt := -mstack-alignment
+      cc_stack_align4 := -mpreferred-stack-boundary=2
+      cc_stack_align8 := -mpreferred-stack-boundary=3
+else ifneq ($(call cc-option, -mstack-alignment=16),)
+      cc_stack_align4 := -mstack-alignment=4
+      cc_stack_align8 := -mstack-alignment=8
 endif

 # How to compile the 16-bit code. Note we always compile for -march=i386;
@@ -36,7 +38,7 @@ REALMODE_CFLAGS	:= $(M16_CFLAGS) -g -Os -D__KERNEL__ \

 REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
 REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2)
+REALMODE_CFLAGS += $(cc_stack_align4)
 export REALMODE_CFLAGS

 # BITS is used as extension for files which are available in a 32 bit
@@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y)
         # Align the stack to the register width instead of using the default
         # alignment of 16 bytes. This reduces stack usage and the number of
         # alignment instructions.
-        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2)
+        KBUILD_CFLAGS += $(cc_stack_align4)

         # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
         # a lot more stack due to the lack of sharing of stacklots:
@@ -117,7 +119,7 @@ else
         # default alignment which keep the stack *mis*aligned.
         # Furthermore an alignment to the register width reduces stack usage
         # and the number of alignment instructions.
-        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3)
+        KBUILD_CFLAGS += $(cc_stack_align8)

 	# Use -mskip-rax-setup if supported.
 	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)

From 8e248849793b9b1172ee95aa4000377eeae1a062 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke
Date: Thu, 17 Aug 2017 11:20:47 -0700
Subject: [PATCH 1186/3220] UPSTREAM: x86/build: Use cc-option to validate
 stack alignment parameter

With the following commit:

  8f91869766c0 ("x86/build: Fix stack alignment for CLang")

cc-option is only used to determine the name of the stack alignment
option supported by the compiler, but not to verify that the actual
parameter